# Q-learning-in-Python/enviroment.py (Python; originally listed as 77 lines, 2.2 KiB)

import random
import pandas as pd
import numpy as np
class Enviroment():
    """Tic-tac-toe environment in which a caller plays against a random bot.

    The board is a flat list of nine cells in row-major order (index
    ``3 * row + col``). A cell holds ``0`` when empty, ``user_symbol`` for
    the caller's marks and ``bot_symbol`` for the bot's marks.
    """

    # Every row, column and diagonal, expressed as triples of board indices.
    _WIN_PATHS = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        """Create an empty board; no opening bot move is made."""
        self.board = [0] * 9
        self.bot_symbol = 2
        self.user_symbol = 1

    def reset(self):
        """Clear every cell so a new game can start."""
        self.board = [0] * 9

    def show(self):
        """Print the board to stdout as a bordered 3x3 grid.

        Empty cells are blank, the user's marks are drawn as ``X`` and any
        other non-empty cell (the bot) as ``O``.
        """
        border = "┼───┼───┼───┼"
        print(border)
        for row in range(3):
            line = "│"
            for cell in self.board[3 * row:3 * row + 3]:
                if cell == 0:
                    glyph = " "
                elif cell == self.user_symbol:
                    glyph = "X"
                else:
                    glyph = "O"
                line += " " + glyph + " │"
            print(line)
            print(border)
        print()

    def get_available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i, cell in enumerate(self.board) if cell == 0]

    def get_winner(self):
        """Return the symbol that owns a completed line, or ``0`` if none.

        Lines made of three empty cells are skipped: they are "equal" but
        must not end the search, otherwise a real win on a later path
        would be missed.
        """
        for a, b, c in self._WIN_PATHS:
            mark = self.board[a]
            if mark != 0 and mark == self.board[b] == self.board[c]:
                return mark
        return 0

    def state_hash(self):
        """Encode the board as an integer, reading the cells as base-3 digits.

        Cell ``i`` contributes ``board[i] * 3**i``, so distinct boards with
        cell values in ``{0, 1, 2}`` map to distinct integers.
        """
        return sum(cell * (3 ** i) for i, cell in enumerate(self.board))

    def bot_action(self):
        """Place the bot's symbol on a uniformly random empty cell.

        Does nothing when the board is full.
        """
        available_actions = self.get_available_actions()
        if available_actions:
            loc = random.choice(available_actions)
            self.board[loc] = self.bot_symbol

    def action(self, loc):
        """Play the user's move at ``loc``, then let the bot reply.

        The bot only replies when the user's move did not already end the
        game, and the winner is re-evaluated after the bot's reply so that
        a bot win is reported on the step where it happens.

        Args:
            loc: Index (0-8) of an empty cell.

        Returns:
            A ``(state, reward, winner)`` tuple: ``state`` is
            ``state_hash()`` of the resulting board; ``reward`` is ``1`` if
            the user has won, ``-1`` if the bot has won and ``0`` otherwise;
            ``winner`` is the winning symbol or ``0``.

        Raises:
            AssertionError: If ``loc`` is not an empty cell.
        """
        # Raised explicitly (rather than via ``assert``) so the check is
        # still enforced when Python runs with -O; the exception type and
        # message are the same as before.
        if loc not in self.get_available_actions():
            raise AssertionError("It's a wrong action")

        self.board[loc] = self.user_symbol
        winner = self.get_winner()
        if winner == 0:
            # Game still open: the bot answers, which may win it the game.
            self.bot_action()
            winner = self.get_winner()

        if winner == self.user_symbol:
            reward = 1
        elif winner == self.bot_symbol:
            reward = -1
        else:
            reward = 0
        return self.state_hash(), reward, winner