from enviroment import Enviroment
import random
import numpy as np

env = Enviroment()

ACTION_NUM = 9          # one action per board cell
STATE_NUM = (3**9)      # each of the 9 cells is empty / X / O
ACTIONS = range(9)
EPSILON = 0.9           # epsilon-greedy: exploit the table with prob. EPSILON
ALPHA = 0.1             # learning rate
LAMBDA = 0.9            # discount factor
FRESH_TIME = 0.3
EPISODE_NUM = 1e4

print("Training EPISODE_NUM: {}\n".format(EPISODE_NUM))


def chooseAction(state, q_table, actions):
    """Pick an action for `state` with an epsilon-greedy policy.

    Explores (uniform random over `actions`) with probability 1 - EPSILON,
    or whenever the state's Q-row is all zeros (no information yet);
    otherwise exploits the highest-valued available action.
    """
    state_action = q_table[state]
    random_num = random.random()
    if random_num > EPSILON or sum(state_action) == 0:
        return random.choice(actions)
    else:
        # Restrict the argmax to the currently legal actions.
        available_actions = [state_action[i] for i in actions]
        choice = np.argmax(available_actions)
        return actions[choice]


def getEstimateSReward(env, table, state):
    """Return max_a Q(state, a) over the actions legal in `env` now.

    Used as the bootstrapped estimate of the successor state's value.
    NOTE(review): reads env.get_available_actions() for the state the
    environment is in *after* the move — caller must call this right
    after env.action().
    """
    state_action = table[state]
    actions = env.get_available_actions()
    available_actions = [state_action[i] for i in actions]
    reward = np.max(available_actions)
    return reward


def evaluate(env, table, times):
    """Play `times` greedy-ish episodes and print the agent's win rate.

    Counts an episode as won when the final `winner` equals
    env.user_symbol. No learning happens here (the Q-table is read-only).
    """
    counter = 0
    for episode in range(times):
        env.reset()
        S = env.state_hash()
        while 1:
            available_actions = env.get_available_actions()
            action = chooseAction(S, table, available_actions)
            S_, R, winner = env.action(action)
            S = S_
            # Terminal: someone won, or this move filled the last cell.
            if winner != 0 or len(available_actions) == 1:
                if winner == env.user_symbol:
                    counter += 1
                break
    print("{}/{} winning percentage: {}%\n".format(counter, times, counter/times*100))


# Q-table: STATE_NUM rows x ACTION_NUM columns, zero-initialised.
table = [[0 for i in range(ACTION_NUM)] for j in range(STATE_NUM)]
# Seed two opening moves with optimistic values to bias early play.
table[0][6] = 0.9
table[0][7] = 1

env = Enviroment()
print("Before Q-Learning")
evaluate(env, table, 10000)

# --- Q-learning training loop ---
for episode in range(int(EPISODE_NUM)):
    env.reset()
    S = env.state_hash()
    while 1:
        available_actions = env.get_available_actions()
        action = chooseAction(S, table, available_actions)
        estimate_R = table[S][action]
        S_, R, winner = env.action(action)
        if winner != 0 or len(available_actions) == 1:
            # Terminal transition: no successor value to bootstrap from.
            real_R = R
        else:
            real_R = R + LAMBDA * getEstimateSReward(env, table, S_)
        # Standard Q-learning update: Q(S,a) += alpha * (target - Q(S,a)).
        table[S][action] += ALPHA * (real_R - estimate_R)
        S = S_
        if winner != 0 or len(available_actions) == 1:
            break

print("After Q-Learning")
evaluate(env, table, 10000)