We have built anagent that learns the numeric tic tac toe game by Q-Learning.There is no change to...

Question

We have built anagent that learns the numeric tic tac toe game by Q-Learning.There is no change to this as this is working fine.

Requirement

Check whether Q-values learnt by the agent have converged or not. Sample any 4 state-action pairs and plot it with the number of episodes to understand the convergence.Make use of the given functions in the environment and agent file and the output file should be in line with Testing_States_tracked.IPYNB

Ujjwal · Accepted Answer

TCGame_Env.py
from __future__ import print_function
import argparse
import matplotlib.pylab as plt
import os
import pickle
import random
import sys
from agent import QLearner, SarsaLearner, Teacher
def plot_agent_reward(rewards, agent_type):
    """ Function to plot agent's accumulated reward vs. episode """
    plt.plot(rewards)
    if agent_type == 'q':
        plt.title('Q-Learning Agent Cumulative Reward vs. Episode')
    else:
        plt.title('Sarsa Agent Cumulative Reward vs. Episode')
    plt.ylabel('Reward')
    plt.xlabel('Episode')
    plt.show()
class Game(object):
    """ The game class. New instance created for each new game. """
    def __init__(self, agent, teacher=None):
        self.computer = agent
        self.teacher = teacher
        # initialize the game board
        self.board = [['-', '-', '-'], ['-', '-', '-'], ['-', '-', '-']]
    def printBoard(self):
        """ Prints the game board as text output to the terminal. """
        print('    0   1   2
')
        row_num = 0
        for row in self.board:
            print('%i   ' % row_num, end='')
            for elt in row:
                print('%s   ' % elt, end='')
            print('
')
            row_num += 1
        print('
')
    def playerMove(self):
        """ Querry player for a move and update the board accordingly. """
        if self.teacher is not None:
            action = self.teacher.makeMove(self.board)
            self.board[action[0]][action[1]] = 'X'
        else:
            self.printBoard()
            while True:
                move = raw_input("Your move! Please select a row and column from 0-2 "
                                 "in the format row,col: ")
                try:
                    row, col = int(move[0]), int(move[2])
                except ValueError:
                    print("INVALID INPUT! Please use the correct format.")
                    continue
                if row not in range(3) or col not in range(3) or not self.board[row][col] == '-':
                    print("INVALID MOVE! Choose again.")
                    continue
                self.board[row][col] = 'X'
                break
    def computerMove(self, action):
        """ Update board according to computer move. """
        self.board[action[0]][action[1]] = 'O'
    def checkForWin(self, key):
        """
        Check to see whether the player/agent with token 'key' has won.
        Returns a boolean holding truth value.
        """
        # check for player win on diagonals
        a = [self.board[0][0], self.board[1][1], self.board[2][2]]
        b = [self.board[0][2], self.board[1][1], self.board[2][0]]
        if a.count(key) == 3 or b.count(key) == 3:
            return True
        # check for player win on rows/columns
        for i in range(3):
            col = [self.board[0][i], self.board[1][i], self.board[2][i]]
            row = [self.board[i][0], self.board[i][1], self.board[i][2]]
            if col.count(key) == 3 or row.count(key) == 3:
                return True
        return False
    def checkForDraw(self):
        """
        Check to see whether the game has ended in a draw. Returns a
        boolean holding truth value.
        """
        draw = True
        for row in self.board:
            for elt in row:
                if elt == '-':
                    draw = False
        return draw
    def checkForEnd(self, key):
        """
        Checks if player/agent with token 'key' has ended the game. Returns -1
        if the game is still going, 0 if it is a draw, and 1 if the player/agent
        has won.
        """
        if self.checkForWin(key):
            if self.teacher is None:
                self.printBoard()
                if key == 'X':
                    print("Player wins!")
                else:
                    print("RL agent wins!")
            return 1
        elif self.checkForDraw():
            if self.teacher is None:
                self.printBoard()
                print("It's a draw!")
            return 0
        return -1
    def getStateKey(self):
        """
        Converts 2D list representing the board state into a string key
        for that state. Keys are used for Q-value hashing.
        """
        key = ''
        for row in self.board:
            for elt in row:
                key += elt
        return key
    def playGame(self, agent_type, player_first):
        """ Begin the tic-tac-toe game loop. """
        # Initialize the agent's state and action
        if player_first:
            self.playerMove()
        oldState = self.getStateKey()
        if agent_type == 's':
            oldAction = self.computer.get_action(oldState)
        if agent_type == 'q':
            # Dealing with QLearner agent
            while True:
                action = self.computer.get_action(oldState)
                self.computerMove(action)
                check = self.checkForEnd('O')
                if not check == -1:
                    reward = check
                    break
                self.playerMove()
                state = self.getStateKey()
                check = self.checkForEnd('X')
                if not check == -1:
                    reward = -1*check
                    break
                else:
                    reward = 0
                self.computer.update(oldState, state, action, reward)
                oldState = state
        else:
            # Dealing with Sarsa agent
            while True:
                self.computerMove(oldAction)
                check = self.checkForEnd('O')
                if not check == -1:
                    reward = check
                    break
                self.playerMove()
                check = self.checkForEnd('X')
                if not check == -1:
                    reward = -1*check
                    break
                else:
                    reward = 0
                state = self.getStateKey()
                action = self.computer.get_action(state)
                self.computer.update(oldState, state, oldAction, action, reward)
                oldState = state
                oldAction = action
        self.computer.total_reward += reward
        self.computer.rewards += [self.computer.total_reward]
        # Final update and save
        if agent_type == 'q':
            self.computer.update(oldState, None, action, reward)
            self.computer.save_agent('./qlearner_agent.pkl')
        else:
            self.computer.update(oldState, None, oldAction, None, reward)
            self.computer.save_agent('./sarsa_agent.pkl')
    def start(self, agent_type):
        """
        Function to determine how to play. Options include whether to employ
        teacher and whether to have computer or player go first.
        """
        if self.teacher is not None:
            # During teaching, chose who goes first randomly with equal probability
            if random.random()  Q_max:
                    Q_max = self.Q[a][s]
                    action = a
        return action
    def save_agent(self, path):
        """ Pickle the agent object instance to save the agent's state. """
        if os.path.isfile(path):
            os.remove(path)
        f = open(path, 'wb')
        pickle.dump(self, f)
        f.close()
class QLearner(Learner):
    """
    A class to implement the Q-learning agent.
    """
    def __init__(self, alpha, gamma, epsilon):
        Learner.__init__(self, alpha, gamma, epsilon)
    def update(self, s, s_, a, r):
        """ Perform the Q-Learning step update of Q values. """
        # Update Q(s,a)
        if s_ is not None:
            # Hold list of Q values for all a_,s_ pairs so we can access max later
            Q_options = []
            for action in self.actions:
                Q_options += [self.Q[action][s_]]
            self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*(r + self.gamma*max(Q_options))
        else:
            self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*r
class SarsaLearner(Learner):
    """
    A class to implement the Sarsa-learning agent.
    """
    def __init__(self, alpha, gamma, epsilon):
        Learner.__init__(self, alpha, gamma, epsilon)
    def update(self, s, s_, a, a_, r):
        """ Perform the Sarsa step update of Q values. """
        # Update Q(s,a)
        if s_ is not None:
            self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*(r + self.gamma*self.Q[a_][s_])
        else:
            self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*r
class Teacher(object):
    """ A class to implement a teacher that knows the optimal playing strategy.
    Teacher returns the best move at any time given the current state of the game.
    Note: things are a bit more hard-coded here, as this was not the main focus of
    the exercise so I did not spend as much time on design/style. Everything works
    properly when tested."""
    def __init__(self, level=0.9):
        """
        Ability level determines the probability that the teacher will follow
        the optimal strategy as opposed to choosing a random available move.
        """
        self.ability_level = level
    def win(self, board, key='X'):
        """ If we have two in a row and the 3rd is available, take it. """
        # Check for diagonal wins
        a = [board[0][0], board[1][1], board[2][2]]
        b = [board[0][2], board[1][1], board[2][0]]
        if a.count('-') == 1 and a.count(key) == 2:
            ind = a.index('-')
            return ind, ind
        elif b.count('-') == 1 and b.count(key) == 2:
            ind = b.index('-')
            if ind == 0:
                return 0, 2
            elif ind == 1:
                return 1, 1
            else:
                return 2, 0
        # Now check for 2 in a row/column + empty 3rd
        for i in range(3):
            c = [board[0][i], board[1][i], board[2][i]]
            d = [board[i][0], board[i][1], board[i][2]]
            if c.count('-') == 1 and c.count(key) == 2:
                ind = c.index('-')
                return ind, i
            elif d.count('-') == 1 and d.count(key) == 2:
                ind = d.index('-')
                return i, ind
        return None
    def blockWin(self, board):
        """ Block the opponent if she has a win available. """
        return self.win(board, key='O')
    def fork(self, board):
        """ Create a fork opportunity such that we have 2 threats to win. """
        # Check all adjacent side middles
        if board[1][0] == 'X' and board[0][1] == 'X':
            if board[0][0] == '-' and board[2][0] == '-' and board[0][2] == '-':
                return 0, 0
            elif board[1][1] == '-' and board[2][1] == '-' and board[1][2] == '-':
                return 1, 1
        elif board[1][0] == 'X' and board[2][1] == 'X':
            if board[2][0] == '-' and board[0][0] == '-' and board[2][2] == '-':
                return 2, 0
            elif board[1][1] == '-' and board[0][1] == '-' and board[1][2] == '-':
                return 1, 1
        elif board[2][1] == 'X' and board[1][2] == 'X':
            if board[2][2] == '-' and board[2][0] == '-' and board[0][2] == '-':
                return 2, 2
            elif board[1][1] == '-' and board[1][0] == '-' and board[0][1] == '-':
                return 1, 1
        elif board[1][2] == 'X' and board[0][1] == 'X':
            if board[0][2] == '-' and board[0][0] == '-' and board[2][2] == '-':
                return 0, 2
            elif board[1][1] == '-' and board[1][0] == '-' and board[2][1] == '-':
                return 1, 1
        # Check all cross corners
        elif board[0][0] == 'X' and board[2][2] == 'X':
            if board[1][0] == '-' and board[2][1] == '-' and board[2][0] == '-':
                return 2, 0
            elif board[0][1] == '-' and board[1][2] == '-' and board[0][2] == '-':
                return 0, 2
        elif board[2][0] == 'X' and board[0][2] == 'X':
            if board[2][1] == '-' and board[1][2] == '-' and board[2][2] == '-':
                return 2, 2
            elif board[1][0] == '-' and board[0][1] == '-' and board[0][0] == '-':
                return 0, 0
        return None
    def blockFork(self, board):
        """ Block the opponents fork if she has one available. """
        corners = [board[0][0], board[2][0], board[0][2], board[2][2]]
        # Check all adjacent side middles
        if board[1][0] == 'O' and board[0][1] == 'O':
            if board[0][0] == '-' and board[2][0] == '-' and board[0][2] == '-':
                return 0, 0
            elif board[1][1] == '-' and board[2][1] == '-' and board[1][2] == '-':
                return 1, 1
        elif board[1][0] == 'O' and board[2][1] == 'O':
            if board[2][0] == '-' and board[0][0] == '-' and board[2][2] == '-':
                return 2, 0
            elif board[1][1] == '-' and board[0][1] == '-' and board[1][2] == '-':
                return 1, 1
        elif board[2][1] == 'O' and board[1][2] == 'O':
            if board[2][2] == '-' and board[2][0] == '-' and board[0][2] == '-':
                return 2, 2
            elif board[1][1] == '-' and board[1][0] == '-' and board[0][1] == '-':
                return 1, 1
        elif board[1][2] == 'O' and board[0][1] == 'O':
            if board[0][2] == '-' and board[0][0] == '-' and board[2][2] == '-':
                return 0, 2
            elif board[1][1] == '-' and board[1][0] == '-' and board[2][1] == '-':
                return 1, 1
        # Check all cross corners (first check for double fork opp using the corners array)
        elif corners.count('-') == 1 and corners.count('O') == 2:
            return 1, 2
        elif board[0][0] == 'O' and board[2][2] == 'O':
            if board[1][0] == '-' and board[2][1] == '-' and board[2][0] == '-':
                return 2, 0
            elif board[0][1] == '-' and board[1][2] == '-' and board[0][2] == '-':
                return 0, 2
        elif board[2][0] == 'O' and board[0][2] == 'O':
            if board[2][1] == '-' and board[1][2] == '-' and board[2][2] == '-':
                return 2, 2
            elif board[1][0] == '-' and board[0][1] == '-' and board[0][0] == '-':
                return 0, 0
        return None
    def center(self, board):
        """ Pick the center if it is available. """
        if board[1][1] == '-':
            return 1, 1
        return None
    def corner(self, board):
        """ Pick a corner move. """
        # Pick opposite corner of opponent if available
        if board[0][0] == 'O' and board[2][2] == '-':
            return 2, 2
        elif board[2][0] == 'O' and board[0][2] == '-':
            return 0, 2
        elif board[0][2] == 'O' and board[2][0] == '-':
            return 2, 0
        elif board[2][2] == 'O' and board[0][0] == '-':
            return 0, 0
        # Pick any corner if no opposites are available
        elif board[0][0] == '-':
            return 0, 0
        elif board[2][0] == '-':
            return 2, 0
        elif board[0][2] == '-':
            return 0, 2
        elif board[2][2] == '-':
            return 2, 2
        return None
    def sideEmpty(self, board):
        """ Pick an empty side. """
        if board[1][0] == '-':
            return 1, 0
        elif board[2][1] == '-':
            return 2, 1
        elif board[1][2] == '-':
            return 1, 2
        elif board[0][1] == '-':
            return 0, 1
        return None
    def randomMove(self, board):
        """ Chose a random move from the available options. """
        possibles = []
        for i in range(3):
            for j in range(3):
                if board[i][j] == '-':
                    possibles += [(i, j)]
        return possibles[random.randint(0, len(possibles)-1)]
    def makeMove(self, board):
        """
        Trainer goes through a hierarchy of moves, making the best move that
        is currently available each time. A touple is returned that represents
        (row, col).
        """
        # Chose randomly with some probability so that the teacher does not always win
        if random.random() > self.ability_level:
            return self.randomMove(board)
        # Follow optimal strategy
        a = self.win(board)
        if a is not None:
            return a
        a = self.blockWin(board)
        if a is not None:
            return a
        a = self.fork(board)
        if a is not None:
            return a
        a = self.blockFork(board)
        if a is not None:
            return a
        a = self.center(board)
        if a is not None:
            return a
        a = self.corner(board)
        if a is not None:
            return a
        a = self.sideEmpty(board)
        if a is not None:
            return a
        return self.randomMove(board)
plots/QLearner_0_6.png
plots/QLearner_0_7.png
plots/QLearner_0_9.png
plots/SarsaLearner_0_6.png
plots/SarsaLearner_0_7.png
plots/SarsaLearner_0_9.png
qlearner_agent.

We have built an agent that learns the numeric tic tac toe game by Q-Learning .There is no change to this as this is working fine. Requirement Check whether Q-values learnt by the agent have converged...

Answer To: We have built an agent that learns the numeric tic tac toe game by Q-Learning .There is no change to...

Answer To This Question Is Available To Download

Related Questions & Answers

Submit New Assignment