From 32a88ac9881c994662e647e7efccf4864317e378 Mon Sep 17 00:00:00 2001 From: snsd0805 Date: Tue, 30 May 2023 02:18:33 +0800 Subject: [PATCH] feat: tic tac toe simulation OK --- main.c | 256 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 218 insertions(+), 38 deletions(-) diff --git a/main.c b/main.c index 3fffd9c..4b0cd9b 100644 --- a/main.c +++ b/main.c @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include #define BOT_SYMBOL 1 #define OPPONENT_SYMBOL 2 @@ -13,6 +16,8 @@ #define STATE_NUM 19683 #define ACTION_NUM 9 +#define EPISODE_NUM 100000 +#define FIRST true short PATHS[8][3] = { {0, 1, 2}, {3, 4, 5}, {6, 7, 8}, @@ -120,9 +125,6 @@ int state_hash(short *board){ int base, hash = 0; for (int i=0; i<9; i++){ base = pow(3, i); - if (board[i]!=0){ - printf("%d * %d + ", base, board[i]); - } hash += (base * board[i]); } return hash; @@ -136,22 +138,31 @@ int state_hash(short *board){ - short *board (array's address): chessboards' status - struct action *a (a action's pointer): include player & choose loc - int *state (pointer): for return. To save the chessboard's state hash which after doing this action - - int *reward (pointer): for return. To save the number of rewards which the player gets after doing this action. + - float *reward (pointer): for return. To save the number of rewards which the player gets after doing this action. + - float *opponent_reward (pointer): for return. To save the number of rewards which the opponents gets after the player doing this action. - short *winner (pointer): for return. To save the winner in this action. If haven't finish, it will be zero. Results: - None. Save in state & reward & winner */ -void act(short *board, struct action *a, int *state, int *reward, short *winner){ +void act(short *board, struct action *a, int *state, float *reward, float *opponent_reward, short *winner){ + // printf("Act( player=%d, action=%d )\n", a->player, a->loc); + assert(board[a->loc] == 0); board[a->loc] = a->player; *winner = get_winner(board); *state = state_hash(board); - if (*winner == a->player) - *reward = 1; - else if(*winner != 0) - *reward = -1; - else + if (*winner == a->player){ + *reward = 1.0; + *opponent_reward = -1.0; + } + else if(*winner != 0){ + *reward = -1.0; + *opponent_reward = 1.0; + } + else{ *reward = 0; + *opponent_reward = 0; + } } /* @@ -164,7 +175,7 @@ void act(short *board, struct action *a, int *state, int *reward, short *winner) Results: - short index (integer): the index with the max value */ -short argmax(short *arr, short length){ +short short_argmax(short *arr, short length){ short ans = -1, max = SHRT_MIN; for (short i=0; i max){ @@ -175,6 +186,28 @@ short argmax(short *arr, short length){ return ans; } +/* + Return the index with the max value in the array + + Args: + - float *arr (array's address) + - short length (integer): array's length + + Results: + - short index (integer): the index with the max value +*/ +short float_argmax(float *arr, short length){ + float ans = -1, max = -FLT_MAX; + for (short i=0; i max){ + max = arr[i]; + ans = i; + } + } + return ans; +} + + /* Choose the next action with Epsilon-Greedy. EPSILON means the probability to choose the best action in this state from Q-Table. @@ -188,64 +221,211 @@ short argmax(short *arr, short length){ Results: - short best_choice */ -short bot_choose_action(short *table, short *board, int state){ +short bot_choose_action(float *table, short *board, int state){ // get available actions for choosing short available_actions[9]; short available_actions_length; - get_available_actions(board, available_actions, available_actions_length); + get_available_actions(board, available_actions, &available_actions_length); // use argmax() to find the best choise, // first we should build an available_actions_state array for saving the state for all available choise. - short available_actions_state[9]; + float available_actions_state[9]; short available_actions_state_index[9]; short available_actions_state_length, index = 0; short temp_index, best_choice; + bool zeros = true; for (short i=0; i EPSILON -> random a action // If random number < EPSILON -> choose the best action in this state. double random_num = (double) rand() / (RAND_MAX + 1.0); - if (random_num > EPSILON){ - best_choice = rand() % ACTION_NUM; + if ((random_num > EPSILON) || zeros){ + best_choice = available_actions_state_index[ rand() % index ]; } return best_choice; } -int main(){ - srand(time(NULL)); - short board[9]= {0}; // tic tac toe's chessboard +/* + Opponent random choose a action to do. + + Args: + - short *table (array's address): state table for Q-Learning + - short *board (array's address): chessboards' status + - int state (integer, state hash): hash for board's status + + Results: + - short choice (integer): random, -1 means no available action to choose +*/ +short opponent_random_action(float *table, short *board, int state){ + + // get available actions for choosing + short available_actions[9]; + short available_action_length; + get_available_actions(board, available_actions, &available_action_length); + + if (available_action_length == 0){ + return -1; + } + + // random + short choice; + choice = (short)( rand() % available_action_length ); + choice = available_actions[choice]; + + return choice; +} + +/* + Inilialize the Q-Table + + Args: + - float *table (two-dim array's start address) + + Results: + - None. +*/ +void init_table(float *table){ + for (int i=0; i