From 147fef527e99efb534a19f5402bf2629c9db0c7e Mon Sep 17 00:00:00 2001
From: snsd0805 <levi900227@gmail.com>
Date: Tue, 30 May 2023 02:35:26 +0800
Subject: [PATCH] feat: split code into different header file

---
 Makefile     |   2 +-
 constant.h   |  11 ++
 enviroment.c | 146 ++++++++++++++++++
 enviroment.h |  11 ++
 main.c       | 413 +--------------------------------------------------
 q-learning.c | 232 +++++++++++++++++++++++++++++
 q-learning.h |   6 +
 7 files changed, 411 insertions(+), 410 deletions(-)
 create mode 100644 constant.h
 create mode 100644 enviroment.c
 create mode 100644 enviroment.h
 create mode 100644 q-learning.c
 create mode 100644 q-learning.h

diff --git a/Makefile b/Makefile
index 86972c2..1e5f3c8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 all: a.out
 a.out: main.c
-	gcc main.c -lm
+	gcc main.c enviroment.c q-learning.c -lm
 run:
 	./a.out
diff --git a/constant.h b/constant.h
new file mode 100644
index 0000000..3c0bd69
--- /dev/null
+++ b/constant.h
@@ -0,0 +1,11 @@
+#define BOT_SYMBOL 1
+#define OPPONENT_SYMBOL 2
+
+#define EPSILON 0.9						// Epsilon-greedy
+#define LR 0.1							// learning rate
+#define LAMBDA 0.9						// discount factor
+
+#define STATE_NUM 19683
+#define ACTION_NUM 9
+#define EPISODE_NUM 100000
+#define FIRST true
diff --git a/enviroment.c b/enviroment.c
new file mode 100644
index 0000000..6f5b721
--- /dev/null
+++ b/enviroment.c
@@ -0,0 +1,146 @@
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+#include "constant.h"
+#include "enviroment.h"
+
+short PATHS[8][3] = {
+	{0, 1, 2}, {3, 4, 5}, {6, 7, 8},
+	{0, 3, 6}, {1, 4, 7}, {2, 5, 8},
+	{0, 4, 8}, {2, 4, 6}
+};
+
+/*
+	Reset the game, clear the chessboard.
+
+	Args:
+		- short *board (array's address): chessboard's status
+
+	Results:
+		- None, set all blocks on the chessboard to zero.
+*/
+void reset(short* board){
+	for (short i=0; i<9; i++)
+		board[i] = 0;
+}
+
+/*
+	Print the chessboard on the console.
+
+	Args:
+		- short *board (array's address): chessboard's status
+
+	Results:
+		- None. Only printing.
+*/
+void show(short *board){
+	short loc;
+	printf("┼───┼───┼───┼\n");
+	for (short i=0; i<3; i++){
+		printf("│ ");
+		for (short j=0; j<3; j++){
+			loc = 3*i+j;
+			if (board[loc] == 0)
+				printf("  │ ");
+			else if (board[loc] == BOT_SYMBOL)
+				printf("○ │ ");
+			else
+				printf("✕ │ ");
+		}
+		printf("\n");
+		printf("┼───┼───┼───┼\n");
+	}
+	printf("\n\n");
+}
+
+/*
+	Save all available actions into the "result" array.
+
+	Args:
+		- short *board (array's address): chessboard's status
+		- short *result (array's address): To save all available actions.
+		- short *length (integer's pointer): To save the number of available actions.
+
+	Results:
+		- None. All available actions are saved into "result" and the number of actions is saved in "length"
+*/
+void get_available_actions(short *board, short *result, short *length){
+	short index = 0;
+	for (int i=0; i<9; i++)
+		if (board[i] == 0)
+			result[index++] = i;
+	*length = index;
+}
+
+/*
+	Return winner's number;
+
+	Args:
+		- short *board (array's address): chessboard's status
+
+	Results:
+		- short winner_number(integer): winner's number, 0 for no winner now, 1 for Bot, 2 for opponent
+*/
+short get_winner(short *board){
+	int a, b, c;
+	for (int i=0; i<8; i++){
+		a = PATHS[i][0]; b = PATHS[i][1]; c = PATHS[i][2];
+		if ((board[a] == board[b]) && (board[b] == board[c]) && (board[a] != 0)){
+			return board[a];
+		}
+	}
+	return 0;
+}
+
+/*
+	Hash chesstable's status into hash.
+
+	Args:
+		- short *board (array's address): chessboard's status
+
+	Results:
+		- int hash (integer): chessboard's status in i-th block * pow(3, i)
+*/
+int state_hash(short *board){
+	int base, hash = 0;
+	for (int i=0; i<9; i++){
+		base = pow(3, i);
+		hash += (base * board[i]);
+	}
+	return hash;
+}
+
+
+/*
+	Act on the chessboard.
+
+	Args:
+		- short *board (array's address): chessboards' status
+		- struct action *a (a action's pointer): include player & choose loc
+		- int *state (pointer): for return. To save the chessboard's state hash which after doing this action
+		- float *reward (pointer): for return. To save the number of rewards which the player gets after doing this action.
+		- float *opponent_reward (pointer): for return. To save the number of rewards which the opponents gets after the player doing this action.
+		- short *winner (pointer): for return. To save the winner in this action. If haven't finish, it will be zero.
+
+	Results:
+		- None. Save in state & reward & winner
+*/
+void act(short *board, struct action *a, int *state, float *reward, float *opponent_reward, short *winner){
+    // printf("Act( player=%d, action=%d )\n", a->player, a->loc);
+    assert(board[a->loc] == 0);
+	board[a->loc] = a->player;
+	*winner = get_winner(board);
+	*state = state_hash(board);
+	if (*winner == a->player){
+		*reward = 1.0;
+        *opponent_reward = -1.0;
+    }
+	else if(*winner != 0){
+		*reward = -1.0;
+        *opponent_reward = 1.0;
+    }
+	else{
+		*reward = 0;
+        *opponent_reward = 0;
+    }
+}
diff --git a/enviroment.h b/enviroment.h
new file mode 100644
index 0000000..4a27df6
--- /dev/null
+++ b/enviroment.h
@@ -0,0 +1,11 @@
+struct action{
+	short player;
+	short loc;
+};
+
+void reset(short* board);
+void show(short *board);
+void get_available_actions(short *board, short *result, short *length);
+short get_winner(short *board);
+int state_hash(short *board);
+void act(short *board, struct action *a, int *state, float *reward, float *opponent_reward, short *winner);
diff --git a/main.c b/main.c
index 4b0cd9b..36a7fd0 100644
--- a/main.c
+++ b/main.c
@@ -1,414 +1,10 @@
 #include <stdio.h>
-#include <math.h>
-#include <limits.h>
 #include <time.h>
 #include <stdlib.h>
 #include <stdbool.h>
-#include <float.h>
-#include <assert.h>
-
-#define BOT_SYMBOL 1
-#define OPPONENT_SYMBOL 2
-
-#define EPSILON 0.9						// Epsilon-greedy
-#define LR 0.1							// learning rate
-#define LAMBDA 0.9						// discount factor
-
-#define STATE_NUM 19683
-#define ACTION_NUM 9
-#define EPISODE_NUM 100000
-#define FIRST true
-
-short PATHS[8][3] = {
-	{0, 1, 2}, {3, 4, 5}, {6, 7, 8},
-	{0, 3, 6}, {1, 4, 7}, {2, 5, 8},
-	{0, 4, 8}, {2, 4, 6}
-};
-
-struct action{
-	short player;
-	short loc;
-};
-
-/*
-	Reset the game, clear the chessboard.
-
-	Args:
-		- short *board (array's address): chessboard's status
-
-	Results:
-		- None, set all blocks on the chessboard to zero.
-*/
-void reset(short* board){
-	for (short i=0; i<9; i++)
-		board[i] = 0;
-}
-
-/*
-	Print the chessboard on the console.
-
-	Args:
-		- short *board (array's address): chessboard's status
-
-	Results:
-		- None. Only printing.
-*/
-void show(short *board){
-	short loc;
-	printf("┼───┼───┼───┼\n");
-	for (short i=0; i<3; i++){
-		printf("│ ");
-		for (short j=0; j<3; j++){
-			loc = 3*i+j;
-			if (board[loc] == 0)
-				printf("  │ ");
-			else if (board[loc] == BOT_SYMBOL)
-				printf("○ │ ");
-			else
-				printf("✕ │ ");
-		}
-		printf("\n");
-		printf("┼───┼───┼───┼\n");
-	}
-	printf("\n\n");
-}
-
-/*
-	Save all available actions into the "result" array.
-
-	Args:
-		- short *board (array's address): chessboard's status
-		- short *result (array's address): To save all available actions.
-		- short *length (integer's pointer): To save the number of available actions.
-
-	Results:
-		- None. All available actions are saved into "result" and the number of actions is saved in "length"
-*/
-void get_available_actions(short *board, short *result, short *length){
-	short index = 0;
-	for (int i=0; i<9; i++)
-		if (board[i] == 0)
-			result[index++] = i;
-	*length = index;
-}
-
-/*
-	Return winner's number;
-
-	Args:
-		- short *board (array's address): chessboard's status
-
-	Results:
-		- short winner_number(integer): winner's number, 0 for no winner now, 1 for Bot, 2 for opponent
-*/
-short get_winner(short *board){
-	int a, b, c;
-	for (int i=0; i<8; i++){
-		a = PATHS[i][0]; b = PATHS[i][1]; c = PATHS[i][2];
-		if ((board[a] == board[b]) && (board[b] == board[c]) && (board[a] != 0)){
-			return board[a];
-		}
-	}
-	return 0;
-}
-
-/*
-	Hash chesstable's status into hash.
-
-	Args:
-		- short *board (array's address): chessboard's status
-
-	Results:
-		- int hash (integer): chessboard's status in i-th block * pow(3, i)
-*/
-int state_hash(short *board){
-	int base, hash = 0;
-	for (int i=0; i<9; i++){
-		base = pow(3, i);
-		hash += (base * board[i]);
-	}
-	return hash;
-}
-
-
-/*
-	Act on the chessboard.
-
-	Args:
-		- short *board (array's address): chessboards' status
-		- struct action *a (a action's pointer): include player & choose loc
-		- int *state (pointer): for return. To save the chessboard's state hash which after doing this action
-		- float *reward (pointer): for return. To save the number of rewards which the player gets after doing this action.
-		- float *opponent_reward (pointer): for return. To save the number of rewards which the opponents gets after the player doing this action.
-		- short *winner (pointer): for return. To save the winner in this action. If haven't finish, it will be zero.
-
-	Results:
-		- None. Save in state & reward & winner
-*/
-void act(short *board, struct action *a, int *state, float *reward, float *opponent_reward, short *winner){
-    // printf("Act( player=%d, action=%d )\n", a->player, a->loc);
-    assert(board[a->loc] == 0);
-	board[a->loc] = a->player;
-	*winner = get_winner(board);
-	*state = state_hash(board);
-	if (*winner == a->player){
-		*reward = 1.0;
-        *opponent_reward = -1.0;
-    }
-	else if(*winner != 0){
-		*reward = -1.0;
-        *opponent_reward = 1.0;
-    }
-	else{
-		*reward = 0;
-        *opponent_reward = 0;
-    }
-}
-
-/*
-	Return the index with the max value in the array
-
-	Args:
-		- short *arr (array's address)
-		- short length (integer): array's length
-
-	Results:
-		- short index (integer): the index with the max value
-*/
-short short_argmax(short *arr, short length){
-	short ans = -1, max = SHRT_MIN;
-	for (short i=0; i<length; i++){
-		if (arr[i] > max){
-			max = arr[i];
-			ans = i;
-		}
-	}
-	return ans;
-}
-
-/*
-	Return the index with the max value in the array
-
-	Args:
-		- float *arr (array's address)
-		- short length (integer): array's length
-
-	Results:
-		- short index (integer): the index with the max value
-*/
-short float_argmax(float *arr, short length){
-	float ans = -1, max = -FLT_MAX;
-	for (short i=0; i<length; i++){
-		if (arr[i] > max){
-			max = arr[i];
-			ans = i;
-		}
-	}
-	return ans;
-}
-
-
-/*
-	Choose the next action with Epsilon-Greedy.
-	EPSILON means the probability to choose the best action in this state from Q-Table.
-	(1-EPSILON) to random an action to do.
-
-	Args:
-		- short *table (array's address): state table for Q-Learning
-		- short *board (array's address): chessboards' status
-		- int state (integer, state hash): hash for board's status
-
-	Results:
-		- short best_choice
-*/
-short bot_choose_action(float *table, short *board, int state){
-
-	// get available actions for choosing
-	short available_actions[9];
-	short available_actions_length;
-	get_available_actions(board, available_actions, &available_actions_length);
-
-	// use argmax() to find the best choise,
-	// first we should build an available_actions_state array for saving the state for all available choise.
-	float available_actions_state[9];
-	short available_actions_state_index[9];
-	short available_actions_state_length, index = 0;
-	short temp_index, best_choice;
-	bool zeros = true;
-	for (short i=0; i<available_actions_length; i++){
-		temp_index = available_actions[i];
-		available_actions_state[index] = *(table + state * ACTION_NUM + temp_index);
-		if (available_actions_state[index] != 0.0){
-			zeros = false;
-		}
-		available_actions_state_index[index] = temp_index;
-		index++;
-	}
-	best_choice = float_argmax(available_actions_state, index);
-	best_choice = available_actions_state_index[best_choice];
-
-	// Epsilon-Greedy
-	// If random number > EPSILON   ->   random a action
-	// If random number < EPSILON   ->   choose the best action in this state.
-	double random_num = (double) rand() / (RAND_MAX + 1.0);
-	if ((random_num > EPSILON) || zeros){
-		best_choice = available_actions_state_index[ rand() % index ];
-	}
-
-    return best_choice;
-}
-
-/*
-    Opponent random choose a action to do.
-
-    Args:
-		- short *table (array's address): state table for Q-Learning
-		- short *board (array's address): chessboards' status
-		- int state (integer, state hash): hash for board's status
-
-	Results:
-		- short choice (integer): random, -1 means no available action to choose
-*/
-short opponent_random_action(float *table, short *board, int state){
-
-    // get available actions for choosing
-    short available_actions[9];
-    short available_action_length;
-    get_available_actions(board, available_actions, &available_action_length);
-
-    if (available_action_length == 0){
-        return -1;
-    }
-
-    // random
-    short choice;
-    choice = (short)( rand() % available_action_length );
-    choice = available_actions[choice];
-
-    return choice;
-}
-
-/*
-    Inilialize the Q-Table
-
-    Args:
-        - float *table (two-dim array's start address)
-
-    Results:
-        - None.
-*/
-void init_table(float *table){
-    for (int i=0; i<STATE_NUM; i++){
-        for (int j=0; j<ACTION_NUM; j++){
-            *(table + i * ACTION_NUM + j) = 0;
-        }
-    }
-}
-
-/*
-    Give the chessboard & state, it will return the max reward with the best choice
-
-    Args:
-        - float *table (2-dim array's start address)
-        - short *board (1-dim array's start address): chessboard's address
-        - int state (integer): board state's hash
-
-    Results:
-        - int max_reward
-*/
-float get_estimate_reward(float *table, short *board, int state){
-    short available_actions[9];
-    short available_action_length;
-    get_available_actions(board, available_actions, &available_action_length);
-
-	float available_actions_state[9];
-    for (short i=0; i<available_action_length; i++){
-        available_actions_state[i] = *(table + state * ACTION_NUM + available_actions[i]);   // table[state][available_actions[i]]
-    }
-
-    short ans_index;
-    ans_index = float_argmax(available_actions_state, available_action_length);
-    return available_actions_state[ans_index];
-}
-
-/*
-    Run Q-learning Evaluation or Training.
-
-    Args:
-        - float *table (2-dim array's start address)
-        - short *board (1-dim array's start address): chessboard's address
-        - bool train: train or not
-        - int times: how many episode to simulate
-        - bool plot: whether to plot the gaming process
-
-    Results:
-        - None
-*/
-void run(float *table, short *board, bool train, int times, bool plot){
-	short available_actions[9];
-	short available_actions_length;
-	short winner;
-    short choice, opponent_choice;
-    int state, _state;
-    float estimate_r, estimate_r_, real_r, r, opponent_r;
-    struct action a;
-
-    int win = 0;
-
-    for (int episode=0; episode<times; episode++){
-        reset(board);
-        state = state_hash(board);
-        while (1){
-            // bot choose the action
-            choice = bot_choose_action(table, board, state);
-            a.loc = choice;
-            a.player = BOT_SYMBOL;
-
-            estimate_r = *(table + state * ACTION_NUM + choice);
-            act(board, &a, &_state, &r, &opponent_r, &winner);
-            if (plot) show(board);
-
-            // opponent random
-            if (winner == 0){
-                opponent_choice = opponent_random_action(table, board, state_hash(board));
-                if (opponent_choice != -1){
-                    a.loc = opponent_choice;
-                    a.player = OPPONENT_SYMBOL;
-                    act(board, &a, &_state, &opponent_r, &r, &winner);
-                    if (plot) show(board);
-                }
-            }
-            get_available_actions(board, available_actions, &available_actions_length);
-
-            if ((winner != 0) || (available_actions_length == 0)){
-                if (plot){
-                    printf("winner: %d, reward: %f, oppo reward: %f\n", winner, r, opponent_r);
-                    printf("==========================================================\n");
-                }
-                real_r = r;
-            } else {
-                estimate_r_ = get_estimate_reward(table, board, _state);
-                real_r = r + LAMBDA * estimate_r_;
-            }
-            if (train){
-                // printf("update");
-                *(table + state * ACTION_NUM + choice) += ( LR * (real_r - estimate_r) );       // table[state][choice] += LR * (real_r - estimate_r)
-            }
-            state = _state;
-
-            if ((winner != 0) || (available_actions_length == 0)){
-                // printf("break\n");
-                if (winner == 1){
-                    win += 1;
-                }
-                break;
-            }
-        }
-    }
-
-    if (!train)
-        printf("%d/%d, %f\%\n", win, 10000, (float)win/10000);
-}
+#include "constant.h"
+#include "enviroment.h"
+#include "q-learning.h"
 
 int main(){
 	short board[9]= {0};			// tic tac toe's chessboard
@@ -425,7 +21,6 @@ int main(){
     init_table(&table[0][0]);
 
     run(&table[0][0], board, false, 10000, false);
-    run(&table[0][0], board, true, 10000000, false);
+    run(&table[0][0], board, true, 1000000, false);
     run(&table[0][0], board, false, 10000, false);
-
 }
diff --git a/q-learning.c b/q-learning.c
new file mode 100644
index 0000000..3470e23
--- /dev/null
+++ b/q-learning.c
@@ -0,0 +1,232 @@
+#include <stdio.h>
+#include <float.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "constant.h"
+#include "enviroment.h"
+
+/*
+	Return the index with the max value in the array
+
+	Args:
+		- float *arr (array's address)
+		- short length (integer): array's length
+
+	Results:
+		- short index (integer): the index with the max value
+*/
+short float_argmax(float *arr, short length){
+	float ans = -1, max = -FLT_MAX;
+	for (short i=0; i<length; i++){
+		if (arr[i] > max){
+			max = arr[i];
+			ans = i;
+		}
+	}
+	return ans;
+}
+
+
+/*
+	Choose the next action with Epsilon-Greedy.
+	EPSILON means the probability to choose the best action in this state from Q-Table.
+	(1-EPSILON) to random an action to do.
+
+	Args:
+		- short *table (array's address): state table for Q-Learning
+		- short *board (array's address): chessboards' status
+		- int state (integer, state hash): hash for board's status
+
+	Results:
+		- short best_choice
+*/
+short bot_choose_action(float *table, short *board, int state){
+
+	// get available actions for choosing
+	short available_actions[9];
+	short available_actions_length;
+	get_available_actions(board, available_actions, &available_actions_length);
+
+	// use argmax() to find the best choise,
+	// first we should build an available_actions_state array for saving the state for all available choise.
+	float available_actions_state[9];
+	short available_actions_state_index[9];
+	short available_actions_state_length, index = 0;
+	short temp_index, best_choice;
+	bool zeros = true;
+	for (short i=0; i<available_actions_length; i++){
+		temp_index = available_actions[i];
+		available_actions_state[index] = *(table + state * ACTION_NUM + temp_index);
+		if (available_actions_state[index] != 0.0){
+			zeros = false;
+		}
+		available_actions_state_index[index] = temp_index;
+		index++;
+	}
+	best_choice = float_argmax(available_actions_state, index);
+	best_choice = available_actions_state_index[best_choice];
+
+	// Epsilon-Greedy
+	// If random number > EPSILON   ->   random a action
+	// If random number < EPSILON   ->   choose the best action in this state.
+	double random_num = (double) rand() / (RAND_MAX + 1.0);
+	if ((random_num > EPSILON) || zeros){
+		best_choice = available_actions_state_index[ rand() % index ];
+	}
+
+    return best_choice;
+}
+
+/*
+    Opponent random choose a action to do.
+
+    Args:
+		- short *table (array's address): state table for Q-Learning
+		- short *board (array's address): chessboards' status
+		- int state (integer, state hash): hash for board's status
+
+	Results:
+		- short choice (integer): random, -1 means no available action to choose
+*/
+short opponent_random_action(float *table, short *board, int state){
+
+    // get available actions for choosing
+    short available_actions[9];
+    short available_action_length;
+    get_available_actions(board, available_actions, &available_action_length);
+
+    if (available_action_length == 0){
+        return -1;
+    }
+
+    // random
+    short choice;
+    choice = (short)( rand() % available_action_length );
+    choice = available_actions[choice];
+
+    return choice;
+}
+
+/*
+    Inilialize the Q-Table
+
+    Args:
+        - float *table (two-dim array's start address)
+
+    Results:
+        - None.
+*/
+void init_table(float *table){
+    for (int i=0; i<STATE_NUM; i++){
+        for (int j=0; j<ACTION_NUM; j++){
+            *(table + i * ACTION_NUM + j) = 0;
+        }
+    }
+}
+
+/*
+    Give the chessboard & state, it will return the max reward with the best choice
+
+    Args:
+        - float *table (2-dim array's start address)
+        - short *board (1-dim array's start address): chessboard's address
+        - int state (integer): board state's hash
+
+    Results:
+        - int max_reward
+*/
+float get_estimate_reward(float *table, short *board, int state){
+    short available_actions[9];
+    short available_action_length;
+    get_available_actions(board, available_actions, &available_action_length);
+
+	float available_actions_state[9];
+    for (short i=0; i<available_action_length; i++){
+        available_actions_state[i] = *(table + state * ACTION_NUM + available_actions[i]);   // table[state][available_actions[i]]
+    }
+
+    short ans_index;
+    ans_index = float_argmax(available_actions_state, available_action_length);
+    return available_actions_state[ans_index];
+}
+
+/*
+    Run Q-learning Evaluation or Training.
+
+    Args:
+        - float *table (2-dim array's start address)
+        - short *board (1-dim array's start address): chessboard's address
+        - bool train: train or not
+        - int times: how many episode to simulate
+        - bool plot: whether to plot the gaming process
+
+    Results:
+        - None
+*/
+void run(float *table, short *board, bool train, int times, bool plot){
+	short available_actions[9];
+	short available_actions_length;
+	short winner;
+    short choice, opponent_choice;
+    int state, _state;
+    float estimate_r, estimate_r_, real_r, r, opponent_r;
+    struct action a;
+
+    int win = 0;
+
+    for (int episode=0; episode<times; episode++){
+        reset(board);
+        state = state_hash(board);
+        while (1){
+            // bot choose the action
+            choice = bot_choose_action(table, board, state);
+            a.loc = choice;
+            a.player = BOT_SYMBOL;
+
+            estimate_r = *(table + state * ACTION_NUM + choice);
+            act(board, &a, &_state, &r, &opponent_r, &winner);
+            if (plot) show(board);
+
+            // opponent random
+            if (winner == 0){
+                opponent_choice = opponent_random_action(table, board, state_hash(board));
+                if (opponent_choice != -1){
+                    a.loc = opponent_choice;
+                    a.player = OPPONENT_SYMBOL;
+                    act(board, &a, &_state, &opponent_r, &r, &winner);
+                    if (plot) show(board);
+                }
+            }
+            get_available_actions(board, available_actions, &available_actions_length);
+
+            if ((winner != 0) || (available_actions_length == 0)){
+                if (plot){
+                    printf("winner: %d, reward: %f, oppo reward: %f\n", winner, r, opponent_r);
+                    printf("==========================================================\n");
+                }
+                real_r = r;
+            } else {
+                estimate_r_ = get_estimate_reward(table, board, _state);
+                real_r = r + LAMBDA * estimate_r_;
+            }
+            if (train){
+                // printf("update");
+                *(table + state * ACTION_NUM + choice) += ( LR * (real_r - estimate_r) );       // table[state][choice] += LR * (real_r - estimate_r)
+            }
+            state = _state;
+
+            if ((winner != 0) || (available_actions_length == 0)){
+                // printf("break\n");
+                if (winner == 1){
+                    win += 1;
+                }
+                break;
+            }
+        }
+    }
+
+    if (!train)
+        printf("%d/%d, %f\%\n", win, 10000, (float)win/10000);
+}
diff --git a/q-learning.h b/q-learning.h
new file mode 100644
index 0000000..16c3287
--- /dev/null
+++ b/q-learning.h
@@ -0,0 +1,6 @@
+short float_argmax(float *arr, short length);
+short bot_choose_action(float *table, short *board, int state);
+short opponent_random_action(float *table, short *board, int state);
+void init_table(float *table);
+float get_estimate_reward(float *table, short *board, int state);
+void run(float *table, short *board, bool train, int times, bool plot);
\ No newline at end of file