feat: Implement Q-learning in Python
commit 19443ffdf4
enviroment.py  77  Normal file
@@ -0,0 +1,77 @@
import random
import pandas as pd
import numpy as np


class Enviroment():

    def __init__(self):
        # Board cells: 0 = empty, 1 = user ("○"), 2 = bot ("✕").
        self.board = [ 0 for i in range(9) ]

        self.bot_symbol = 2
        self.user_symbol = 1

        # self.bot_action()

    def reset(self):
        self.board = [ 0 for i in range(9) ]

    def show(self):
        print("┼───┼───┼───┼")
        for i in range(3):
            print("│ ", end='')
            for j in range(3):
                if self.board[ 3*i + j ] == 0:
                    print(" ", end=' │ ')
                elif self.board[ 3*i + j ] == 1:
                    print("○", end=' │ ')
                else:
                    print("✕", end=' │ ')
            print()
            print("┼───┼───┼───┼")
        print()

    def get_available_actions(self):
        # Indices of all empty cells.
        ans = []
        for i in range(9):
            if self.board[i] == 0:
                ans.append(i)
        return ans

    def get_winner(self):
        paths = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],
            [0, 3, 6], [1, 4, 7], [2, 5, 8],
            [0, 4, 8], [2, 4, 6]
        ]
        for path in paths:
            x, y, z = path
            # A line only wins if all three cells hold the same non-empty symbol.
            if self.board[x] != 0 and self.board[x] == self.board[y] == self.board[z]:
                return self.board[x]

        return 0

    def state_hash(self):
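        # Base-3 encoding: cell i contributes board[i] * 3**i, so every board
        # maps to a unique index in [0, 3**9). For example, a board whose first
        # two cells are [1, 2] and the rest empty hashes to 1*1 + 2*3 = 7.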
        ans = 0
        for i in range(9):
            ans += self.board[i] * (3**i)
        return ans

    def bot_action(self):
        # The opponent plays a uniformly random legal move.
        available_actions = self.get_available_actions()
        if len(available_actions) > 0:
            loc = random.choice(available_actions)
            self.board[loc] = self.bot_symbol

    def action(self, loc):
        assert loc in self.get_available_actions(), "invalid action"
        self.board[loc] = self.user_symbol

        winner = self.get_winner()
        if winner == 0:
            # The bot only replies while the game is still open, and its move
            # can itself decide the game.
            self.bot_action()
            winner = self.get_winner()

        if winner == self.user_symbol:
            reward = 1
        elif winner == self.bot_symbol:
            reward = -1
        else:
            reward = 0

        state = self.state_hash()
        return state, reward, winner
main.py  96  Normal file
@@ -0,0 +1,96 @@
from enviroment import Enviroment
import random
import numpy as np

env = Enviroment()

ACTION_NUM = 9
STATE_NUM = (3**9)
ACTIONS = range(9)
EPSILON = 0.9   # epsilon-greedy
ALPHA = 0.1     # learning rate
LAMBDA = 0.9    # discount factor
FRESH_TIME = 0.3
EPISODE_NUM = 1e4
print("Training EPISODE_NUM: {}\n".format(EPISODE_NUM))

def chooseAction(state, q_table, actions):
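    # Epsilon-greedy selection: with probability EPSILON pick the action with the
    # highest Q-value among the legal ones; otherwise, or when this state's
    # Q-values still sum to zero (untrained), pick a random legal move.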
    state_action = q_table[state]

    random_num = random.random()
    if random_num > EPSILON or sum(state_action) == 0:
        return random.choice(actions)
    else:
        available_actions = [ state_action[i] for i in actions ]
        choice = np.argmax(available_actions)
        return actions[choice]

def getEstimateSReward(env, table, state):
    # Greedy value of `state`: the maximum Q-value over its legal actions.
    state_action = table[state]
    actions = env.get_available_actions()
    available_actions = [ state_action[i] for i in actions ]
    reward = np.max(available_actions)
    return reward

def evaluate(env, table, times):
    # Play `times` games with the current Q-table and report the winning percentage.
    counter = 0
    for episode in range(times):
        env.reset()
        S = env.state_hash()
        while 1:
            available_actions = env.get_available_actions()
            action = chooseAction(S, table, available_actions)

            # estimate_R / real_R mirror the training loop, but evaluation never
            # updates the table.
            estimate_R = table[S][action]
            S_, R, winner = env.action(action)

            if winner != 0 or len(available_actions) == 1:
                real_R = R
            else:
                real_R = R + LAMBDA * getEstimateSReward(env, table, S_)

            S = S_

            if winner != 0 or len(available_actions) == 1:
                if winner == env.user_symbol:
                    counter += 1
                break
    print("{}/{} winning percentage: {}%\n".format(counter, times, counter / times * 100))

table = [ [ 0 for i in range(ACTION_NUM)] for j in range(STATE_NUM) ]
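# State 0 is the hash of the empty board, so the two assignments below hand-seed
# the opening move: cells 6 and 7 start with non-zero Q-values.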
table[0][6] = 0.9
table[0][7] = 1

env = Enviroment()

print("Before Q-Learning")
evaluate(env, table, 10000)

for episode in range(int(EPISODE_NUM)):
    env.reset()
    S = env.state_hash()
    # print(S)
    while 1:
        available_actions = env.get_available_actions()
        action = chooseAction(S, table, available_actions)

        estimate_R = table[S][action]
        S_, R, winner = env.action(action)
        # env.show()

        if winner != 0 or len(available_actions) == 1:
            # Terminal step: the target is just the final reward.
            real_R = R
        else:
            # Non-terminal step: bootstrap from the best Q-value of the next state.
            real_R = R + LAMBDA * getEstimateSReward(env, table, S_)

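        # Tabular Q-learning update:
        #   Q(S, action) <- Q(S, action) + ALPHA * (target - Q(S, action))
        # where the target is R + LAMBDA * max_a' Q(S_, a') (or just R at a
        # terminal step), so real_R - estimate_R is the temporal-difference error.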
        table[S][action] += ALPHA * (real_R - estimate_R)
        S = S_

        if winner != 0 or len(available_actions) == 1:
            break
    # print("\n\n")
    # print("==============================")
    # print(counter)

print("After Q-Learning")
evaluate(env, table, 10000)