Q-learning-in-Python/enviroment.py

import random
import pandas as pd
import numpy as np

class Enviroment:
    """Tic-tac-toe board played by a user (agent) against a random bot.

    The board is a flat list of 9 cells in row-major order (index
    ``3 * row + col``).  A cell holds ``0`` when empty, ``user_symbol``
    (1) for the user's mark, or ``bot_symbol`` (2) for the bot's mark.
    """

    # The eight index triples that form a winning line:
    # three rows, three columns, two diagonals.
    _WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        """Create an empty board; the bot does not make an opening move."""
        self.board = [0] * 9

        self.bot_symbol = 2
        self.user_symbol = 1

    def reset(self):
        """Clear every cell so a new game can start."""
        self.board = [0] * 9

    def show(self):
        """Print the board to stdout as a 3x3 grid followed by a blank line.

        Empty cells are blank, cells equal to 1 are drawn as "○", and
        any other value is drawn as "✕".
        """
        glyphs = {0: " ", 1: "○"}
        print("┼───┼───┼───┼")
        for row in range(3):
            print("│ ", end='')
            for col in range(3):
                print(glyphs.get(self.board[3 * row + col], "✕"), end=' │ ')
            print()
            print("┼───┼───┼───┼")
        print()

    def get_available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [idx for idx, cell in enumerate(self.board) if cell == 0]

    def get_winner(self):
        """Return the symbol that owns a complete line, or 0 if none does.

        A line only counts when its cells are equal AND non-empty.
        Without the non-empty check, a line of three empty cells would
        compare equal and end the search early with 0, hiding a real
        winning line that appears later in the scan order.
        """
        for x, y, z in self._WIN_LINES:
            mark = self.board[x]
            if mark != 0 and mark == self.board[y] == self.board[z]:
                return mark

        return 0

    def state_hash(self):
        """Encode the board as an integer, reading the cells as base-3 digits.

        Cell ``i`` contributes ``board[i] * 3**i``.
        """
        return sum(cell * (3 ** idx) for idx, cell in enumerate(self.board))

    def bot_action(self):
        """Place the bot's symbol on a random empty cell.

        Does nothing when the board is full.
        """
        available_actions = self.get_available_actions()
        if available_actions:
            loc = random.choice(available_actions)
            self.board[loc] = self.bot_symbol

    def action(self, loc):
        """Play the user's move at ``loc`` and let the bot reply.

        The bot replies only when the user's move did not already
        produce a winner.  The winner is then evaluated again, so a line
        completed by the bot's reply is reported in the same step.

        Args:
            loc: Index (0-8) of an empty cell.

        Returns:
            A tuple ``(state, reward, winner)``: ``state`` is
            ``state_hash()`` of the resulting board; ``reward`` is 1 if
            the user has won, -1 if the bot has won, 0 otherwise;
            ``winner`` is the winning symbol or 0.

        Raises:
            AssertionError: If ``loc`` is not an empty cell.
        """
        # NOTE: assert is kept (rather than raising ValueError) so existing
        # callers that rely on AssertionError keep working; be aware that it
        # is stripped when Python runs with -O.
        assert loc in self.get_available_actions(), "It's a wrong action"
        self.board[loc] = self.user_symbol

        winner = self.get_winner()                              # if != 0: stop
        if winner == 0:
            self.bot_action()
            # The bot's reply may have completed a line; check again so the
            # loss is rewarded on this step.
            winner = self.get_winner()

        if winner == self.user_symbol:
            reward = 1
        elif winner == self.bot_symbol:
            reward = -1
        else:
            reward = 0

        state = self.state_hash()
        return state, reward, winner