We have built an agent that learns the numeric tic-tac-toe game by Q-learning. There is no change to this, as it is working fine.



Requirement


Check whether the Q-values learnt by the agent have converged or not. Sample any 4 state-action pairs and plot them against the number of episodes to understand the convergence. Make use of the given functions in the environment and agent files, and the output file should be in line with Testing_States_tracked.IPYNB (a sketch of such a convergence check appears after the answer code below).

Answered Same Day: Feb 02, 2021


Ujjwal answered on Feb 03 2021
142 Votes
TCGame_Env.py
from __future__ import print_function
import argparse
import matplotlib.pyplot as plt
import os
import pickle
import random
import sys
from agent import QLearner, SarsaLearner, Teacher


def plot_agent_reward(rewards, agent_type):
    """ Function to plot agent's accumulated reward vs. episode """
    plt.plot(rewards)
    if agent_type == 'q':
        plt.title('Q-Learning Agent Cumulative Reward vs. Episode')
    else:
        plt.title('Sarsa Agent Cumulative Reward vs. Episode')
    plt.ylabel('Reward')
    plt.xlabel('Episode')
    plt.show()
class Game(object):
    """ The game class. New instance created for each new game. """

    def __init__(self, agent, teacher=None):
        self.computer = agent
        self.teacher = teacher
        # initialize the game board
        self.board = [['-', '-', '-'], ['-', '-', '-'], ['-', '-', '-']]

    def printBoard(self):
        """ Prints the game board as text output to the terminal. """
        print(' 0 1 2\n')
        row_num = 0
        for row in self.board:
            print('%i ' % row_num, end='')
            for elt in row:
                print('%s ' % elt, end='')
            print('\n')
            row_num += 1
        print('\n')
    def playerMove(self):
        """ Query the player for a move and update the board accordingly. """
        if self.teacher is not None:
            action = self.teacher.makeMove(self.board)
            self.board[action[0]][action[1]] = 'X'
        else:
            self.printBoard()
            while True:
                move = input("Your move! Please select a row and column from 0-2 "
                             "in the format row,col: ")
                try:
                    row, col = int(move[0]), int(move[2])
                except (ValueError, IndexError):
                    # IndexError covers inputs that are too short to index
                    print("INVALID INPUT! Please use the correct format.")
                    continue
                if row not in range(3) or col not in range(3) or not self.board[row][col] == '-':
                    print("INVALID MOVE! Choose again.")
                    continue
                self.board[row][col] = 'X'
                break
    def computerMove(self, action):
        """ Update board according to computer move. """
        self.board[action[0]][action[1]] = 'O'

    def checkForWin(self, key):
        """
        Check to see whether the player/agent with token 'key' has won.
        Returns a boolean holding truth value.
        """
        # check for player win on diagonals
        a = [self.board[0][0], self.board[1][1], self.board[2][2]]
        b = [self.board[0][2], self.board[1][1], self.board[2][0]]
        if a.count(key) == 3 or b.count(key) == 3:
            return True
        # check for player win on rows/columns
        for i in range(3):
            col = [self.board[0][i], self.board[1][i], self.board[2][i]]
            row = [self.board[i][0], self.board[i][1], self.board[i][2]]
            if col.count(key) == 3 or row.count(key) == 3:
                return True
        return False
    def checkForDraw(self):
        """
        Check to see whether the game has ended in a draw. Returns a
        boolean holding truth value.
        """
        draw = True
        for row in self.board:
            for elt in row:
                if elt == '-':
                    draw = False
        return draw

    def checkForEnd(self, key):
        """
        Checks if player/agent with token 'key' has ended the game. Returns -1
        if the game is still going, 0 if it is a draw, and 1 if the player/agent
        has won.
        """
        if self.checkForWin(key):
            if self.teacher is None:
                self.printBoard()
                if key == 'X':
                    print("Player wins!")
                else:
                    print("RL agent wins!")
            return 1
        elif self.checkForDraw():
            if self.teacher is None:
                self.printBoard()
                print("It's a draw!")
            return 0
        return -1
    def getStateKey(self):
        """
        Converts 2D list representing the board state into a string key
        for that state. Keys are used for Q-value hashing.
        """
        key = ''
        for row in self.board:
            for elt in row:
                key += elt
        return key
    def playGame(self, agent_type, player_first):
        """ Begin the tic-tac-toe game loop. """
        # Initialize the agent's state and action
        if player_first:
            self.playerMove()
        oldState = self.getStateKey()
        if agent_type == 's':
            oldAction = self.computer.get_action(oldState)
        if agent_type == 'q':
            # Dealing with QLearner agent
            while True:
                action = self.computer.get_action(oldState)
                self.computerMove(action)
                check = self.checkForEnd('O')
                if not check == -1:
                    reward = check
                    break
                self.playerMove()
                state = self.getStateKey()
                check = self.checkForEnd('X')
                if not check == -1:
                    reward = -1*check
                    break
                else:
                    reward = 0
                self.computer.update(oldState, state, action, reward)
                oldState = state
        else:
            # Dealing with Sarsa agent
            while True:
                self.computerMove(oldAction)
                check = self.checkForEnd('O')
                if not check == -1:
                    reward = check
                    break
                self.playerMove()
                check = self.checkForEnd('X')
                if not check == -1:
                    reward = -1*check
                    break
                else:
                    reward = 0
                state = self.getStateKey()
                action = self.computer.get_action(state)
                self.computer.update(oldState, state, oldAction, action, reward)
                oldState = state
                oldAction = action
        self.computer.total_reward += reward
        self.computer.rewards += [self.computer.total_reward]
        # Final (terminal) update and save
        if agent_type == 'q':
            self.computer.update(oldState, None, action, reward)
            self.computer.save_agent('./qlearner_agent.pkl')
        else:
            self.computer.update(oldState, None, oldAction, None, reward)
            self.computer.save_agent('./sarsa_agent.pkl')
    def start(self, agent_type):
        """
        Function to determine how to play. Options include whether to employ
        a teacher and whether to have the computer or the player go first.
        """
        if self.teacher is not None:
            # During teaching, choose who goes first randomly with equal probability
            if random.random() < 0.5:
                self.playGame(agent_type, False)
            else:
                self.playGame(agent_type, True)
        else:
            while True:
                response = input("Would you like to go first? [y/n]: ")
                if response == 'n' or response == 'no':
                    self.playGame(agent_type, False)
                    break
                elif response == 'y' or response == 'yes':
                    self.playGame(agent_type, True)
                    break
                else:
                    print("Invalid input. Please enter 'y' or 'n'.")
class GameLearning(object):
    """
    A class that holds the state of the learning process. Learning
    agents are created/loaded here, and a count is kept of the
    games that have been played.
    """

    def __init__(self, args, alpha=0.5, gamma=0.9, epsilon=0.1):
        self.games_played = 0
        if args.load:
            # load agent
            if args.learner_type == 'q':
                # QLearner
                try:
                    f = open('./qlearner_agent.pkl', 'rb')
                except IOError:
                    print("The agent file does not exist. Quitting.")
                    sys.exit(0)
                self.type = 'q'
            else:
                # SarsaLearner
                try:
                    f = open('./sarsa_agent.pkl', 'rb')
                except IOError:
                    print("The agent file does not exist. Quitting.")
                    sys.exit(0)
                self.type = 's'
            self.agent = pickle.load(f)
            f.close()
            # If plotting, show plot and quit
            if args.plot:
                plot_agent_reward(self.agent.rewards, self.type)
                sys.exit(0)
        else:
            # check if agent state file already exists, and ask user whether to overwrite if so
            if ((args.learner_type == "q" and os.path.isfile('./qlearner_agent.pkl')) or
                    (args.learner_type == "s" and os.path.isfile('./sarsa_agent.pkl'))):
                while True:
                    response = input("An agent state is already saved for this type. "
                                     "Are you sure you want to overwrite? [y/n]: ")
                    if response == 'y' or response == 'yes':
                        break
                    elif response == 'n' or response == 'no':
                        print("OK. Quitting.")
                        sys.exit(0)
                    else:
                        print("Invalid input. Please choose 'y' or 'n'.")
            if args.learner_type == "q":
                self.agent = QLearner(alpha, gamma, epsilon)
                self.type = 'q'
            else:
                self.agent = SarsaLearner(alpha, gamma, epsilon)
                self.type = 's'
    def beginPlaying(self):
        """ Loop through game iterations with a human player. """
        print("Welcome to Tic-Tac-Toe. You are 'X' and the computer is 'O'.")

        def play_again():
            print("Games played: %i" % self.games_played)
            while True:
                play = input("Do you want to play again? [y/n]: ")
                if play == 'y' or play == 'yes':
                    return True
                elif play == 'n' or play == 'no':
                    return False
                else:
                    print("Invalid input. Please choose 'y' or 'n'.")

        while True:
            game = Game(self.agent)
            game.start(self.type)
            self.games_played += 1
            if not play_again():
                print("OK. Quitting.")
                break

    def beginTeaching(self, episodes):
        """ Loop through game iterations with a teaching agent. """
        teacher = Teacher()
        # Train for the allotted number of episodes
        while self.games_played < episodes:
            game = Game(self.agent, teacher=teacher)
            game.start(self.type)
            self.games_played += 1
            # Monitor progress
            if self.games_played % 500 == 0:
                print("Games played: %i" % self.games_played)
if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Play Tic-Tac-Toe.")
    parser.add_argument("learner_type", help="Specify the computer agent learning algorithm: "
                        "'q' for Q-learning and 's' for Sarsa-learning",
                        type=str, default="q")
    parser.add_argument("-l", "--load", help="load trained agent", action="store_true")
    parser.add_argument("-t", "--teacher", help="employ teacher agent who knows the optimal strategy",
                        default=None, type=int)
    parser.add_argument("-p", "--plot", help="plot reward vs. episode of stored agent and quit",
                        action="store_true")
    args = parser.parse_args()

    assert args.learner_type == 'q' or args.learner_type == 's', \
        "learner type must be either 'q' or 's'."
    if args.plot:
        assert args.load, "Must load an agent to plot reward."
        assert args.teacher is None, \
            "Cannot plot and teach concurrently; must choose one or the other."

    gl = GameLearning(args)
    if args.teacher is not None:
        gl.beginTeaching(args.teacher)
    else:
        gl.beginPlaying()
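For reference, a typical workflow with this script (assuming it is saved so that the agent file is importable as `agent`, per the import at the top) is to train against the teacher for a number of episodes and then reload the pickled agent to plot its reward curve:

python TCGame_Env.py q -t 5000   # train the Q-learner over 5000 teacher episodes
python TCGame_Env.py q -l -p     # load the saved agent and plot reward vs. episode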
Testing_States_tracked.ipynb.py
import unittest
from agent import QLearner, SarsaLearner, Teacher
from game import Game, GameLearning


class TestGameAndAgents(unittest.TestCase):

    def setUp(self):
        # Use epsilon = 0 so that actions are deterministic
        # and therefore testable
        self.q_agent = QLearner(0.5, 0.9, 0)
        self.s_agent = SarsaLearner(0.5, 0.9, 0)
        # test deterministic teacher
        self.teacher = Teacher(1)
        self.game = Game(self.q_agent)
    def testGame(self):
        self.game.computerMove((1, 1))
        self.assertEqual(self.game.board[1][1], 'O')
        with self.assertRaises(IndexError):
            self.game.computerMove((3, 3))
        self.assertFalse(self.game.checkForWin('O'))
        self.assertFalse(self.game.checkForDraw())
        self.assertEqual(self.game.checkForEnd('O'), -1)
        self.game.computerMove((0, 0))
        self.game.computerMove((2, 2))
        self.assertTrue(self.game.checkForWin('O'))
        self.assertFalse(self.game.checkForDraw())
        self.assertEqual(self.game.checkForEnd('O'), 1)
    def testLearningAgents(self):
        a1 = self.q_agent.get_action('---------')
        self.assertEqual(a1, (0, 0))
        a2 = self.q_agent.get_action('O--------')
        self.assertEqual(a2, (1, 0))
        self.q_agent.update('---------', '----OX---', (1, 1), 1)
        self.s_agent.update('---------', '----OX---', (1, 1), (2, 1), -1)
        self.assertEqual(self.q_agent.Q[(1, 1)]['---------'], 0.5)
        self.assertEqual(self.s_agent.Q[(1, 1)]['---------'], -0.5)
    def testTeachingAgent(self):
        board1 = [['-', '-', '-'], ['-', '-', '-'], ['-', '-', '-']]
        board2 = [['X', '-', 'O'], ['O', 'X', '-'], ['-', '-', '-']]
        # Should be center
        self.assertEqual(self.teacher.makeMove(board1), (1, 1))
        # Should be corner for win
        self.assertEqual(self.teacher.makeMove(board2), (2, 2))


if __name__ == '__main__':
    unittest.main()
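These checks can be run with the standard unittest runner, assuming the file is saved under a plain importable .py name (the ".ipynb.py" label above would need to become, say, testing_states_tracked.py):

python -m unittest testing_states_tracked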
TicTacToe_Agent.ipynb.py
import collections
import numpy as np
import os
import pickle
import random


class Learner(object):
    """
    Parent class for Q-learning and Sarsa-learning agents.
    """

    def __init__(self, alpha, gamma, epsilon):
        # Reward accumulator
        self.total_reward = 0
        # Keep a list of the accumulated reward for each episode
        self.rewards = []
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        # Possible actions correspond to the set of all (row, col) coordinate pairs
        self.actions = []
        for i in range(3):
            self.actions += [(0, i), (1, i), (2, i)]
        self.Q = {}
        for action in self.actions:
            # Initialize Q-values of all state-action pairs to 0
            self.Q[action] = collections.defaultdict(int)
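            # e.g. self.Q[(0, 0)]['---------'] is the value of playing the
            # top-left corner from the empty board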
    def get_action(self, s):
        # Make sure we only consider empty board spaces
        possible_actions = [a for a in self.actions if s[a[0]*3 + a[1]] == '-']
        if random.random() < self.epsilon:
            # Random choice (explore)
            action = possible_actions[random.randint(0, len(possible_actions)-1)]
        else:
            # Greedy choice (exploit). At least one action will always be
            # possible when this function is called.
            Q_max = -np.inf
            for a in possible_actions:
                if self.Q[a][s] > Q_max:
                    Q_max = self.Q[a][s]
                    action = a
        return action
    def save_agent(self, path):
        """ Pickle the agent object instance to save the agent's state. """
        if os.path.isfile(path):
            os.remove(path)
        with open(path, 'wb') as f:
            pickle.dump(self, f)
class QLearner(Learner):
    """
    A class to implement the Q-learning agent.
    """

    def __init__(self, alpha, gamma, epsilon):
        Learner.__init__(self, alpha, gamma, epsilon)

    def update(self, s, s_, a, r):
        """ Perform the Q-learning step update of Q-values. """
        # Update Q(s,a)
        if s_ is not None:
            # Hold a list of Q-values for all a_, s_ pairs so we can access the max later
            Q_options = []
            for action in self.actions:
                Q_options += [self.Q[action][s_]]
            self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*(r + self.gamma*max(Q_options))
        else:
            self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*r
class SarsaLearner(Learner):
    """
    A class to implement the Sarsa-learning agent.
    """

    def __init__(self, alpha, gamma, epsilon):
        Learner.__init__(self, alpha, gamma, epsilon)

    def update(self, s, s_, a, a_, r):
        """ Perform the Sarsa step update of Q-values. """
        # Update Q(s,a)
        if s_ is not None:
            self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*(r + self.gamma*self.Q[a_][s_])
        else:
            self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*r
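

# Worked example of the two updates (numbers match the unit tests; alpha = 0.5,
# gamma = 0.9, starting from Q[a][s] = 0): a Q-learning step with r = 1 and the
# max over successor Q-values equal to 0 gives
# Q[a][s] = 0.5*0 + 0.5*(1 + 0.9*0) = 0.5, while a Sarsa step with r = -1 and
# Q[a_][s_] = 0 gives -0.5.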
class Teacher(object):
    """
    A class to implement a teacher that knows the optimal playing strategy.
    The teacher returns the best move at any time given the current state of
    the game. Note: this class is more hard-coded than the rest, as it was not
    the main focus of the exercise.
    """

    def __init__(self, level=0.9):
        """
        The ability level determines the probability that the teacher will
        follow the optimal strategy as opposed to choosing a random available
        move.
        """
        self.ability_level = level
    def win(self, board, key='X'):
        """ If we have two in a row and the 3rd is available, take it. """
# Check for diagonal wins
a = [board[0][0], board[1][1], board[2][2]]
b = [board[0][2], board[1][1], board[2][0]]
if a.count('-') == 1 and a.count(key) == 2:
ind = a.index('-')
return ind, ind
elif b.count('-') == 1 and b.count(key) == 2:
ind = b.index('-')
if ind == 0:
return 0, 2
elif ind == 1:
return 1, 1
else:
return 2, 0
# Now check for 2 in a row/column + empty 3rd
for i in range(3):
c = [board[0][i], board[1][i], board[2][i]]
d = [board[i][0], board[i][1], board[i][2]]
if c.count('-') == 1 and c.count(key) == 2:
ind = c.index('-')
return ind, i
elif d.count('-') == 1 and d.count(key) == 2:
ind = d.index('-')
return i, ind
return None
    def blockWin(self, board):
        """ Block the opponent if she has a win available. """
        return self.win(board, key='O')

    def fork(self, board):
        """ Create a fork opportunity such that we have 2 threats to win. """
        # Check all adjacent side middles
        if board[1][0] == 'X' and board[0][1] == 'X':
            if board[0][0] == '-' and board[2][0] == '-' and board[0][2] == '-':
                return 0, 0
            elif board[1][1] == '-' and board[2][1] == '-' and board[1][2] == '-':
                return 1, 1
        elif board[1][0] == 'X' and board[2][1] == 'X':
            if board[2][0] == '-' and board[0][0] == '-' and board[2][2] == '-':
                return 2, 0
            elif board[1][1] == '-' and board[0][1] == '-' and board[1][2] == '-':
                return 1, 1
        elif board[2][1] == 'X' and board[1][2] == 'X':
            if board[2][2] == '-' and board[2][0] == '-' and board[0][2] == '-':
                return 2, 2
            elif board[1][1] == '-' and board[1][0] == '-' and board[0][1] == '-':
                return 1, 1
        elif board[1][2] == 'X' and board[0][1] == 'X':
            if board[0][2] == '-' and board[0][0] == '-' and board[2][2] == '-':
                return 0, 2
            elif board[1][1] == '-' and board[1][0] == '-' and board[2][1] == '-':
                return 1, 1
        # Check all cross corners
        elif board[0][0] == 'X' and board[2][2] == 'X':
            if board[1][0] == '-' and board[2][1] == '-' and board[2][0] == '-':
                return 2, 0
            elif board[0][1] == '-' and board[1][2] == '-' and board[0][2] == '-':
                return 0, 2
        elif board[2][0] == 'X' and board[0][2] == 'X':
            if board[2][1] == '-' and board[1][2] == '-' and board[2][2] == '-':
                return 2, 2
            elif board[1][0] == '-' and board[0][1] == '-' and board[0][0] == '-':
                return 0, 0
        return None
    def blockFork(self, board):
        """ Block the opponent's fork if she has one available. """
        corners = [board[0][0], board[2][0], board[0][2], board[2][2]]
        # Check all adjacent side middles
        if board[1][0] == 'O' and board[0][1] == 'O':
            if board[0][0] == '-' and board[2][0] == '-' and board[0][2] == '-':
                return 0, 0
            elif board[1][1] == '-' and board[2][1] == '-' and board[1][2] == '-':
                return 1, 1
        elif board[1][0] == 'O' and board[2][1] == 'O':
            if board[2][0] == '-' and board[0][0] == '-' and board[2][2] == '-':
                return 2, 0
            elif board[1][1] == '-' and board[0][1] == '-' and board[1][2] == '-':
                return 1, 1
        elif board[2][1] == 'O' and board[1][2] == 'O':
            if board[2][2] == '-' and board[2][0] == '-' and board[0][2] == '-':
                return 2, 2
            elif board[1][1] == '-' and board[1][0] == '-' and board[0][1] == '-':
                return 1, 1
        elif board[1][2] == 'O' and board[0][1] == 'O':
            if board[0][2] == '-' and board[0][0] == '-' and board[2][2] == '-':
                return 0, 2
            elif board[1][1] == '-' and board[1][0] == '-' and board[2][1] == '-':
                return 1, 1
        # Check all cross corners (first check for a double fork opportunity
        # using the corners array)
        elif corners.count('-') == 1 and corners.count('O') == 2:
            return 1, 2
        elif board[0][0] == 'O' and board[2][2] == 'O':
            if board[1][0] == '-' and board[2][1] == '-' and board[2][0] == '-':
                return 2, 0
            elif board[0][1] == '-' and board[1][2] == '-' and board[0][2] == '-':
                return 0, 2
        elif board[2][0] == 'O' and board[0][2] == 'O':
            if board[2][1] == '-' and board[1][2] == '-' and board[2][2] == '-':
                return 2, 2
            elif board[1][0] == '-' and board[0][1] == '-' and board[0][0] == '-':
                return 0, 0
        return None
    def center(self, board):
        """ Pick the center if it is available. """
        if board[1][1] == '-':
            return 1, 1
        return None

    def corner(self, board):
        """ Pick a corner move. """
        # Pick the corner opposite the opponent's if available
        if board[0][0] == 'O' and board[2][2] == '-':
            return 2, 2
        elif board[2][0] == 'O' and board[0][2] == '-':
            return 0, 2
        elif board[0][2] == 'O' and board[2][0] == '-':
            return 2, 0
        elif board[2][2] == 'O' and board[0][0] == '-':
            return 0, 0
        # Pick any corner if no opposites are available
        elif board[0][0] == '-':
            return 0, 0
        elif board[2][0] == '-':
            return 2, 0
        elif board[0][2] == '-':
            return 0, 2
        elif board[2][2] == '-':
            return 2, 2
        return None

    def sideEmpty(self, board):
        """ Pick an empty side. """
        if board[1][0] == '-':
            return 1, 0
        elif board[2][1] == '-':
            return 2, 1
        elif board[1][2] == '-':
            return 1, 2
        elif board[0][1] == '-':
            return 0, 1
        return None

    def randomMove(self, board):
        """ Choose a random move from the available options. """
        possibles = []
        for i in range(3):
            for j in range(3):
                if board[i][j] == '-':
                    possibles += [(i, j)]
        return possibles[random.randint(0, len(possibles)-1)]
    def makeMove(self, board):
        """
        The teacher goes through a hierarchy of moves, making the best move
        that is currently available each time. A tuple representing (row, col)
        is returned.
        """
        # Choose randomly with some probability so that the teacher does not always win
        if random.random() > self.ability_level:
            return self.randomMove(board)
        # Follow the optimal strategy
        a = self.win(board)
        if a is not None:
            return a
        a = self.blockWin(board)
        if a is not None:
            return a
        a = self.fork(board)
        if a is not None:
            return a
        a = self.blockFork(board)
        if a is not None:
            return a
        a = self.center(board)
        if a is not None:
            return a
        a = self.corner(board)
        if a is not None:
            return a
        a = self.sideEmpty(board)
        if a is not None:
            return a
        # The original listing was truncated here; a random fallback is the
        # assumed completion, since every earlier strategy returned None
        return self.randomMove(board)
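To address the stated requirement (checking whether the learned Q-values have converged), the following is a minimal sketch that samples four state-action pairs, records their Q-values after every training episode, and plots the curves against the episode count. It assumes the two files above are saved as agent.py and game.py, matching the import statements they already use; the four tracked pairs and the 5000-episode budget are illustrative, and any visited pairs would do. Convergence shows up as the four curves flattening out.

import matplotlib.pyplot as plt

from agent import QLearner, Teacher
from game import Game

# Fresh Q-learner trained against the teacher
agent = QLearner(alpha=0.5, gamma=0.9, epsilon=0.1)
teacher = Teacher()

# Four sampled state-action pairs to track; the state keys use the
# 9-character board encoding produced by Game.getStateKey (illustrative picks)
tracked = [((1, 1), '---------'),
           ((0, 0), 'O---X----'),
           ((2, 2), 'X---O----'),
           ((0, 2), '----X---O')]
history = {pair: [] for pair in tracked}

for episode in range(5000):
    Game(agent, teacher=teacher).start('q')
    # Record the current Q-value of each tracked pair after this episode
    for action, state in tracked:
        history[(action, state)].append(agent.Q[action][state])

# Flat curves indicate that the tracked Q-values have converged
for (action, state), values in history.items():
    plt.plot(values, label='Q[%s]["%s"]' % (action, state))
plt.xlabel('Episode')
plt.ylabel('Q-value')
plt.legend()
plt.title('Q-value convergence for sampled state-action pairs')
plt.show()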