From 32a88ac9881c994662e647e7efccf4864317e378 Mon Sep 17 00:00:00 2001
From: snsd0805 <levi900227@gmail.com>
Date: Tue, 30 May 2023 02:18:33 +0800
Subject: [PATCH] feat: tic tac toe simulation OK

---
 main.c | 256 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 218 insertions(+), 38 deletions(-)

diff --git a/main.c b/main.c
index 3fffd9c..4b0cd9b 100644
--- a/main.c
+++ b/main.c
@@ -3,6 +3,9 @@
 #include <limits.h>
 #include <time.h>
 #include <stdlib.h>
+#include <stdbool.h>
+#include <float.h>
+#include <assert.h>
 
 #define BOT_SYMBOL 1
 #define OPPONENT_SYMBOL 2
@@ -13,6 +16,8 @@
 
 #define STATE_NUM 19683
 #define ACTION_NUM 9
+#define EPISODE_NUM 100000
+#define FIRST true
 
 short PATHS[8][3] = {
 	{0, 1, 2}, {3, 4, 5}, {6, 7, 8},
@@ -120,9 +125,6 @@ int state_hash(short *board){
 	int base, hash = 0;
 	for (int i=0; i<9; i++){
 		base = pow(3, i);
-		if (board[i]!=0){
-			printf("%d * %d + ", base, board[i]);
-		}
 		hash += (base * board[i]);
 	}
 	return hash;
@@ -136,22 +138,31 @@ int state_hash(short *board){
 		- short *board (array's address): chessboards' status
 		- struct action *a (a action's pointer): include player & choose loc
 		- int *state (pointer): for return. To save the chessboard's state hash which after doing this action
-		- int *reward (pointer): for return. To save the number of rewards which the player gets after doing this action.
+		- float *reward (pointer): for return. To save the number of rewards which the player gets after doing this action.
+		- float *opponent_reward (pointer): for return. To save the number of rewards which the opponents gets after the player doing this action.
 		- short *winner (pointer): for return. To save the winner in this action. If haven't finish, it will be zero.
 
 	Results:
 		- None. Save in state & reward & winner
 */
-void act(short *board, struct action *a, int *state, int *reward, short *winner){
+void act(short *board, struct action *a, int *state, float *reward, float *opponent_reward, short *winner){
+    // An occupied target cell means the caller picked an illegal move; fail fast (assert is compiled out under NDEBUG).
+    assert(board[a->loc] == 0);
 	board[a->loc] = a->player;
 	*winner = get_winner(board);
 	*state = state_hash(board);
-	if (*winner == a->player)
-		*reward = 1;
-	else if(*winner != 0)
-		*reward = -1;
-	else
+	if (*winner == a->player){	// get_winner() reports the acting player as the winner
+		*reward = 1.0;
+        *opponent_reward = -1.0;
+    }
+	else if(*winner != 0){	// a winner is reported, but it is not the acting player
+		*reward = -1.0;
+        *opponent_reward = 1.0;
+    }
+	else{	// get_winner() returned 0: nobody has won, so neither side is rewarded
 		*reward = 0;
+        *opponent_reward = 0;
+    }
 }
 
 /*
@@ -164,7 +175,7 @@ void act(short *board, struct action *a, int *state, int *reward, short *winner)
 	Results:
 		- short index (integer): the index with the max value
 */
-short argmax(short *arr, short length){
+short short_argmax(short *arr, short length){
 	short ans = -1, max = SHRT_MIN;
 	for (short i=0; i<length; i++){
 		if (arr[i] > max){
@@ -175,6 +186,28 @@ short argmax(short *arr, short length){
 	return ans;
 }
 
+/*
+	Return the index of the largest value in a float array (the first one on ties).
+
+	Args:
+		- float *arr (array's address)
+		- short length (integer): array's length
+	Results:
+		- short index (integer): the index with the max value; -1 if no element is greater than -FLT_MAX (e.g. length <= 0)
+*/
+short float_argmax(float *arr, short length){
+	short ans = -1;				// an index, so it uses the return type rather than float
+	float max = -FLT_MAX;		// lowest finite float; only a strictly greater element replaces it
+	for (short i=0; i<length; i++){
+		if (arr[i] > max){
+			max = arr[i];
+			ans = i;
+		}
+	}
+	return ans;
+}
+
+
 /*
 	Choose the next action with Epsilon-Greedy.
 	EPSILON means the probability to choose the best action in this state from Q-Table.
@@ -188,64 +221,211 @@ short argmax(short *arr, short length){
 	Results:
 		- short best_choice
 */
-short bot_choose_action(short *table, short *board, int state){
+short bot_choose_action(float *table, short *board, int state){
 
 	// get available actions for choosing
 	short available_actions[9];
 	short available_actions_length;
-	get_available_actions(board, available_actions, available_actions_length);
+	get_available_actions(board, available_actions, &available_actions_length);
 
 	// use argmax() to find the best choise,
 	// first we should build an available_actions_state array for saving the state for all available choise.
-	short available_actions_state[9];
+	float available_actions_state[9];
 	short available_actions_state_index[9];
 	short available_actions_state_length, index = 0;
 	short temp_index, best_choice;
+	bool zeros = true;	// stays true while every available action's Q-value is exactly 0
 	for (short i=0; i<available_actions_length; i++){
 		temp_index = available_actions[i];
-		available_actions_state[index] = table[state * 9 + temp_index ];
+		available_actions_state[index] = *(table + state * ACTION_NUM + temp_index);	// Q-value stored for (state, temp_index)
+		if (available_actions_state[index] != 0.0){
+			zeros = false;
+		}
 		available_actions_state_index[index] = temp_index;
 		index++;
 	}
-	best_choice = argmax(available_actions_state, index);
+	best_choice = float_argmax(available_actions_state, index);	// NOTE(review): yields -1 when index == 0; callers appear to call this only while a move is available - confirm
 	best_choice = available_actions_state_index[best_choice];
 
 	// Epsilon-Greedy
 	// If random number > EPSILON   ->   random a action
 	// If random number < EPSILON   ->   choose the best action in this state.
 	double random_num = (double) rand() / (RAND_MAX + 1.0);
-	if (random_num > EPSILON){
-		best_choice = rand() % ACTION_NUM;
+	if ((random_num > EPSILON) || zeros){	// also pick at random when all available Q-values are 0 (argmax would always return the first one)
+		best_choice = available_actions_state_index[ rand() % index ];	// draw among the available actions only
 	}
 
     return best_choice;
 }
 
-int main(){
-	srand(time(NULL));
-	short board[9]= {0};			// tic tac toe's chessboard
+/*
+    The opponent picks, at random, one of the actions reported by get_available_actions().
+
+    Args:
+		- float *table (array's address): state table for Q-Learning; not read by this function
+		- short *board (array's address): chessboards' status
+		- int state (integer, state hash): hash for board's status; not read by this function
+
+	Results:
+		- short choice (integer): random, -1 means no available action to choose
+*/
+short opponent_random_action(float *table, short *board, int state){
+
+    // get available actions for choosing
+    short available_actions[9];
+    short available_action_length;
+    get_available_actions(board, available_actions, &available_action_length);
+
+    if (available_action_length == 0){
+        return -1;
+    }
+
+    // draw a position inside the available list, then map it to the action stored there
+    short choice;
+    choice = (short)( rand() % available_action_length );
+    choice = available_actions[choice];
+
+    return choice;
+}
+
+/*
+    Initialize the Q-Table: set every one of its STATE_NUM * ACTION_NUM entries to 0.
+
+    Args:
+        - float *table (two-dim array's start address): must have room for STATE_NUM * ACTION_NUM floats
+
+    Results:
+        - None. The table is overwritten in place.
+*/
+void init_table(float *table){
+    for (int i=0; i<STATE_NUM; i++){
+        for (int j=0; j<ACTION_NUM; j++){
+            *(table + i * ACTION_NUM + j) = 0;       // row i (state), column j (action) of the row-major table
+        }
+    }
+}
+
+/*
+    Given the chessboard & its state hash, return the largest Q-value among the actions that are still available.
+
+    Args:
+        - float *table (2-dim array's start address): Q-table, read as table[state * ACTION_NUM + action]
+        - short *board (1-dim array's start address): chessboard's address
+        - int state (integer): board state's hash
+
+    Results:
+        - float max_reward: the best stored value, or 0 when float_argmax() finds no entry (e.g. no action is available)
+*/
+float get_estimate_reward(float *table, short *board, int state){
+    short available_actions[9];
+    short available_action_length;
+    get_available_actions(board, available_actions, &available_action_length);
+
+	float available_actions_state[9];
+    for (short i=0; i<available_action_length; i++){
+        available_actions_state[i] = *(table + state * ACTION_NUM + available_actions[i]);   // table[state][available_actions[i]]
+    }
+
+    // float_argmax() returns -1 for an empty list; never index the array with it.
+    short ans_index = float_argmax(available_actions_state, available_action_length);
+    return (ans_index < 0) ? 0 : available_actions_state[ans_index];
+}
+
+/*
+    Play `times` episodes of bot (moves first) vs. random opponent; update the Q-table when `train` is set.
+
+    Args:
+        - float *table (2-dim array's start address): Q-table, accessed as table[state * ACTION_NUM + action]
+        - short *board (1-dim array's start address): chessboard's address, reset() at the start of every episode
+        - bool train: true = apply the Q-learning update after each bot move; false = only play and count wins
+        - int times: how many episodes to simulate
+        - bool plot: whether to show() the board after every move and print the result of each episode
+
+    Results:
+        - None. When train is false, prints the bot's win count and its win rate (in percent) over `times` episodes.
+*/
+void run(float *table, short *board, bool train, int times, bool plot){
 	short available_actions[9];
 	short available_actions_length;
 	short winner;
+    short choice, opponent_choice;
+    int state, _state;
+    float estimate_r, estimate_r_, real_r, r, opponent_r;
+    struct action a;
 
-	board[1] = 2;
-	board[3] = 1;
-	board[4] = 1;
-	// board[5] = 2;
-	// board[7] = 2;
+    int win = 0;
 
-	show(board);
-	winner = get_winner(board);
-	printf("winner: %d\n", winner);
+    for (int episode=0; episode<times; episode++){
+        reset(board);
+        state = state_hash(board);
+        while (1){
+            // bot choose the action
+            choice = bot_choose_action(table, board, state);
+            a.loc = choice;
+            a.player = BOT_SYMBOL;
 
-	get_available_actions(board, available_actions, &available_actions_length);
-	for (short i=0; i<available_actions_length; i++){
-		printf("%d ", available_actions[i]);
-	}
-	printf("\n");
+            estimate_r = *(table + state * ACTION_NUM + choice);       // current Q-value of the move about to be played
+            act(board, &a, &_state, &r, &opponent_r, &winner);
+            if (plot) show(board);
+
+            // opponent random
+            if (winner == 0){
+                opponent_choice = opponent_random_action(table, board, state_hash(board));
+                if (opponent_choice != -1){
+                    a.loc = opponent_choice;
+                    a.player = OPPONENT_SYMBOL;
+                    act(board, &a, &_state, &opponent_r, &r, &winner);       // reward pointers swapped on purpose: r stays the bot's reward
+                    if (plot) show(board);
+                }
+            }
+            get_available_actions(board, available_actions, &available_actions_length);
+
+            if ((winner != 0) || (available_actions_length == 0)){
+                if (plot){
+                    printf("winner: %d, reward: %f, oppo reward: %f\n", winner, r, opponent_r);
+                    printf("==========================================================\n");
+                }
+                real_r = r;       // episode is over: there is no next state to estimate from
+            } else {
+                estimate_r_ = get_estimate_reward(table, board, _state);
+                real_r = r + LAMBDA * estimate_r_;
+            }
+            if (train){
+                // printf("update");
+                *(table + state * ACTION_NUM + choice) += ( LR * (real_r - estimate_r) );       // table[state][choice] += LR * (real_r - estimate_r)
+            }
+            state = _state;
+
+            if ((winner != 0) || (available_actions_length == 0)){
+                // printf("break\n");
+                if (winner == BOT_SYMBOL){
+                    win += 1;
+                }
+                break;
+            }
+        }
+    }
+
+    if (!train)     // evaluation run: report wins over the episodes actually played; "%%" prints a literal percent sign
+        printf("%d/%d, %f%%\n", win, times, (times > 0) ? 100.0f * win / times : 0.0f);
+}
+
+int main(){
+	short board[9]= {0};			// tic tac toe's chessboard
+
+    // Q-table with one row per board hash and one column per action. At
+    // STATE_NUM * ACTION_NUM floats (19683 * 9, roughly 700 KB with 4-byte floats)
+    // it is too large to keep on main()'s stack safely, so it gets static storage.
+    static float table[STATE_NUM][ACTION_NUM];
+
+	srand(time(NULL));
+    init_table(&table[0][0]);
+
+    // 1. evaluate the untrained table (run() prints the bot's win statistics)
+    run(&table[0][0], board, false, 10000, false);
+    // 2. train the table
+    run(&table[0][0], board, true, 10000000, false);
+    // 3. evaluate again with the trained table
+    run(&table[0][0], board, false, 10000, false);
 
-	int hash = state_hash(board);
-	printf("hash: %d", hash);
-	// reset(board);
-	// show(board);
 }