import random

import numpy as np
import pandas as pd


class Enviroment():
    """Tic-tac-toe board on which a user plays against a random bot.

    The board is a flat list of 9 cells in row-major order (cell index
    ``3 * row + col``). Each cell holds 0 (empty), ``user_symbol`` (1) or
    ``bot_symbol`` (2). The bot makes no move on construction; it only
    moves in reply to the user's move inside ``action``.
    """

    # Every triple of cell indices that forms a winning line:
    # three rows, three columns, two diagonals.
    _WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        """Create an empty board and fix the symbols used by each player."""
        self.board = [0] * 9
        self.bot_symbol = 2
        self.user_symbol = 1

    def reset(self):
        """Clear every cell of the board back to empty (0)."""
        self.board = [0] * 9

    def show(self):
        """Print the board to stdout: "○" for 1, "✕" for any other non-zero cell."""
        print("┼───┼───┼───┼")
        for row in range(3):
            print("│ ", end='')
            for col in range(3):
                cell = self.board[3 * row + col]
                if cell == 0:
                    glyph = " "
                elif cell == 1:
                    glyph = "○"
                else:
                    glyph = "✕"
                print(glyph, end=' │ ')
            print()
            print("┼───┼───┼───┼")
        print()

    def get_available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [idx for idx, cell in enumerate(self.board) if cell == 0]

    def get_winner(self):
        """Return the symbol that owns a completed line, or 0 if there is none.

        Lines made of three empty cells are skipped: otherwise an empty
        line listed early (e.g. an empty top row) would return 0 before
        a genuinely completed line further down the table was examined.
        """
        for x, y, z in self._WIN_LINES:
            first = self.board[x]
            if first != 0 and first == self.board[y] and first == self.board[z]:
                return first
        return 0

    def state_hash(self):
        """Encode the board as an integer, reading the cells as base-3 digits.

        Cell ``i`` contributes ``board[i] * 3**i``, so distinct boards map
        to distinct integers in ``[0, 3**9)``.
        """
        return sum(cell * (3 ** idx) for idx, cell in enumerate(self.board))

    def bot_action(self):
        """Place the bot's symbol on a uniformly random empty cell.

        Does nothing when the board is already full.
        """
        available_actions = self.get_available_actions()
        if available_actions:
            loc = random.choice(available_actions)
            self.board[loc] = self.bot_symbol

    def action(self, loc):
        """Play the user's move at ``loc``, let the bot reply, report the result.

        Args:
            loc: index (0-8) of an empty cell.

        Returns:
            A ``(state, reward, winner)`` tuple: ``state`` is
            ``state_hash()`` of the resulting board, ``reward`` is 1 if the
            user has won, -1 if the bot has won and 0 otherwise, and
            ``winner`` is the winning symbol or 0.

        Raises:
            AssertionError: if ``loc`` is not an empty cell.
        """
        # Raised explicitly (instead of a bare ``assert``) so the check is
        # not stripped when Python runs with -O; the exception type is kept.
        if loc not in self.get_available_actions():
            raise AssertionError("It's a wrong action")
        self.board[loc] = self.user_symbol

        winner = self.get_winner()
        if winner == 0:
            # The game is still open: the bot replies, and the board must
            # be re-checked because that reply may have completed a line.
            self.bot_action()
            winner = self.get_winner()

        if winner == self.user_symbol:
            reward = 1
        elif winner == self.bot_symbol:
            reward = -1
        else:
            reward = 0

        state = self.state_hash()
        return state, reward, winner