432 lines
11 KiB
C
432 lines
11 KiB
C
#include <stdio.h>
|
|
#include <math.h>
|
|
#include <limits.h>
|
|
#include <time.h>
|
|
#include <stdlib.h>
|
|
#include <stdbool.h>
|
|
#include <float.h>
|
|
#include <assert.h>
|
|
|
|
/* Cell markers stored in the board array; an empty cell holds 0. */
#define BOT_SYMBOL      1
#define OPPONENT_SYMBOL 2

/* Q-learning hyper-parameters. */
#define EPSILON 0.9 // epsilon-greedy: probability of playing the greedy (best known) action
#define LR      0.1 // learning rate
#define LAMBDA  0.9 // discount factor

/* Q-table dimensions and run settings. */
#define STATE_NUM   19683 // 3^9: three possible values for each of the nine cells
#define ACTION_NUM  9     // one action per board cell
#define EPISODE_NUM 100000
#define FIRST       true
|
|
|
|
/* The eight winning lines of the board, each given as three cell indices (0..8, row-major). */
short PATHS[8][3] = {
    /* rows */
    {0, 1, 2},
    {3, 4, 5},
    {6, 7, 8},
    /* columns */
    {0, 3, 6},
    {1, 4, 7},
    {2, 5, 8},
    /* diagonals */
    {0, 4, 8},
    {2, 4, 6}
};
|
|
|
|
/* One move on the board: who plays it and on which cell. */
struct action {
    short player; /* marker written into the cell (BOT_SYMBOL or OPPONENT_SYMBOL) */
    short loc;    /* index of the target cell in the board array */
};
|
|
|
|
/*
    Clear the chessboard so a new game can start.

    Args:
        - short *board (array's address): the nine cells of the chessboard

    Results:
        - None. Every one of the nine cells is set to zero (empty).
*/
void reset(short* board){
    short *const end = board + 9;
    for (short *cell = board; cell != end; ++cell){
        *cell = 0;
    }
}
|
|
|
|
/*
|
|
Print the chessboard on the console.
|
|
|
|
Args:
|
|
- short *board (array's address): chessboard's status
|
|
|
|
Results:
|
|
- None. Only printing.
|
|
*/
|
|
void show(short *board){
|
|
short loc;
|
|
printf("┼───┼───┼───┼\n");
|
|
for (short i=0; i<3; i++){
|
|
printf("│ ");
|
|
for (short j=0; j<3; j++){
|
|
loc = 3*i+j;
|
|
if (board[loc] == 0)
|
|
printf(" │ ");
|
|
else if (board[loc] == BOT_SYMBOL)
|
|
printf("○ │ ");
|
|
else
|
|
printf("✕ │ ");
|
|
}
|
|
printf("\n");
|
|
printf("┼───┼───┼───┼\n");
|
|
}
|
|
printf("\n\n");
|
|
}
|
|
|
|
/*
    Collect the indices of all empty cells.

    Args:
        - short *board (array's address): the nine cells of the chessboard
        - short *result (array's address): receives the empty cells' indices in ascending order;
          must have room for up to nine entries
        - short *length (integer's pointer): receives how many indices were written

    Results:
        - None. The answer is returned through "result" and "length".
*/
void get_available_actions(short *board, short *result, short *length){
    short count = 0;
    for (short cell = 0; cell < 9; cell++){
        if (board[cell] != 0){
            continue; // occupied
        }
        result[count] = cell;
        count++;
    }
    *length = count;
}
|
|
|
|
/*
|
|
Return winner's number;
|
|
|
|
Args:
|
|
- short *board (array's address): chessboard's status
|
|
|
|
Results:
|
|
- short winner_number(integer): winner's number, 0 for no winner now, 1 for Bot, 2 for opponent
|
|
*/
|
|
short get_winner(short *board){
|
|
int a, b, c;
|
|
for (int i=0; i<8; i++){
|
|
a = PATHS[i][0]; b = PATHS[i][1]; c = PATHS[i][2];
|
|
if ((board[a] == board[b]) && (board[b] == board[c]) && (board[a] != 0)){
|
|
return board[a];
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
    Encode the chessboard as a single integer (a base-3 number).

    Args:
        - short *board (array's address): the nine cells of the chessboard

    Results:
        - int hash (integer): sum over i of board[i] * 3^i. With cell values
          0..2 the result lies in 0..19682.
*/
int state_hash(short *board){
    int hash = 0;
    int weight = 1; // 3^i, kept as an exact integer (pow() returns a double that may truncate wrongly)
    for (int i=0; i<9; i++){
        hash += weight * board[i];
        weight *= 3;
    }
    return hash;
}
|
|
|
|
|
|
/*
|
|
Act on the chessboard.
|
|
|
|
Args:
|
|
- short *board (array's address): chessboards' status
|
|
- struct action *a (a action's pointer): include player & choose loc
|
|
- int *state (pointer): for return. To save the chessboard's state hash which after doing this action
|
|
- float *reward (pointer): for return. To save the number of rewards which the player gets after doing this action.
|
|
- float *opponent_reward (pointer): for return. To save the number of rewards which the opponents gets after the player doing this action.
|
|
- short *winner (pointer): for return. To save the winner in this action. If haven't finish, it will be zero.
|
|
|
|
Results:
|
|
- None. Save in state & reward & winner
|
|
*/
|
|
void act(short *board, struct action *a, int *state, float *reward, float *opponent_reward, short *winner){
|
|
// printf("Act( player=%d, action=%d )\n", a->player, a->loc);
|
|
assert(board[a->loc] == 0);
|
|
board[a->loc] = a->player;
|
|
*winner = get_winner(board);
|
|
*state = state_hash(board);
|
|
if (*winner == a->player){
|
|
*reward = 1.0;
|
|
*opponent_reward = -1.0;
|
|
}
|
|
else if(*winner != 0){
|
|
*reward = -1.0;
|
|
*opponent_reward = 1.0;
|
|
}
|
|
else{
|
|
*reward = 0;
|
|
*opponent_reward = 0;
|
|
}
|
|
}
|
|
|
|
/*
    Find the position of the largest value in an array of shorts.

    Args:
        - short *arr (array's address)
        - short length (integer): array's length

    Results:
        - short index (integer): index of the first occurrence of the maximum.
          -1 when no element is greater than SHRT_MIN (which includes an empty array).
*/
short short_argmax(short *arr, short length){
    short best_index = -1;
    short best_value = SHRT_MIN;
    short i = 0;
    while (i < length){
        if (best_value < arr[i]){
            best_index = i;
            best_value = arr[i];
        }
        i++;
    }
    return best_index;
}
|
|
|
|
/*
    Return the index with the max value in the array

    Args:
        - float *arr (array's address)
        - short length (integer): array's length

    Results:
        - short index (integer): the index of the first occurrence of the max value.
          -1 only when the array is empty (length <= 0); for a non-empty array
          the result is always a valid index.
*/
short float_argmax(float *arr, short length){
    short ans = -1; // an index, so it is a short (not a float)
    float max = -FLT_MAX;
    for (short i=0; i<length; i++){
        if (arr[i] > max){
            max = arr[i];
            ans = i;
        }
    }
    // No element beat the seed (e.g. every entry is -FLT_MAX): still hand back
    // a usable index, because callers index arrays with the result.
    if (ans < 0 && length > 0){
        ans = 0;
    }
    return ans;
}
|
|
|
|
|
|
/*
|
|
Choose the next action with Epsilon-Greedy.
|
|
EPSILON means the probability to choose the best action in this state from Q-Table.
|
|
(1-EPSILON) to random an action to do.
|
|
|
|
Args:
|
|
- short *table (array's address): state table for Q-Learning
|
|
- short *board (array's address): chessboards' status
|
|
- int state (integer, state hash): hash for board's status
|
|
|
|
Results:
|
|
- short best_choice
|
|
*/
|
|
short bot_choose_action(float *table, short *board, int state){
|
|
|
|
// get available actions for choosing
|
|
short available_actions[9];
|
|
short available_actions_length;
|
|
get_available_actions(board, available_actions, &available_actions_length);
|
|
|
|
// use argmax() to find the best choise,
|
|
// first we should build an available_actions_state array for saving the state for all available choise.
|
|
float available_actions_state[9];
|
|
short available_actions_state_index[9];
|
|
short available_actions_state_length, index = 0;
|
|
short temp_index, best_choice;
|
|
bool zeros = true;
|
|
for (short i=0; i<available_actions_length; i++){
|
|
temp_index = available_actions[i];
|
|
available_actions_state[index] = *(table + state * ACTION_NUM + temp_index);
|
|
if (available_actions_state[index] != 0.0){
|
|
zeros = false;
|
|
}
|
|
available_actions_state_index[index] = temp_index;
|
|
index++;
|
|
}
|
|
best_choice = float_argmax(available_actions_state, index);
|
|
best_choice = available_actions_state_index[best_choice];
|
|
|
|
// Epsilon-Greedy
|
|
// If random number > EPSILON -> random a action
|
|
// If random number < EPSILON -> choose the best action in this state.
|
|
double random_num = (double) rand() / (RAND_MAX + 1.0);
|
|
if ((random_num > EPSILON) || zeros){
|
|
best_choice = available_actions_state_index[ rand() % index ];
|
|
}
|
|
|
|
return best_choice;
|
|
}
|
|
|
|
/*
    Let the opponent pick one of the empty cells uniformly via rand().

    Args:
        - float *table (array's address): Q-Table (not used by this function)
        - short *board (array's address): chessboards' status
        - int state (integer, state hash): hash for board's status (not used by this function)

    Results:
        - short choice (integer): index of the picked cell, -1 means no available action to choose
*/
short opponent_random_action(float *table, short *board, int state){

    // list the empty cells
    short free_cells[9];
    short free_count;
    get_available_actions(board, free_cells, &free_count);

    if (free_count != 0){
        // draw a position inside the list, then map it back to a cell index
        short pick = (short)( rand() % free_count );
        return free_cells[pick];
    }

    return -1;
}
|
|
|
|
/*
|
|
Inilialize the Q-Table
|
|
|
|
Args:
|
|
- float *table (two-dim array's start address)
|
|
|
|
Results:
|
|
- None.
|
|
*/
|
|
void init_table(float *table){
|
|
for (int i=0; i<STATE_NUM; i++){
|
|
for (int j=0; j<ACTION_NUM; j++){
|
|
*(table + i * ACTION_NUM + j) = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
Give the chessboard & state, it will return the max reward with the best choice
|
|
|
|
Args:
|
|
- float *table (2-dim array's start address)
|
|
- short *board (1-dim array's start address): chessboard's address
|
|
- int state (integer): board state's hash
|
|
|
|
Results:
|
|
- int max_reward
|
|
*/
|
|
float get_estimate_reward(float *table, short *board, int state){
|
|
short available_actions[9];
|
|
short available_action_length;
|
|
get_available_actions(board, available_actions, &available_action_length);
|
|
|
|
float available_actions_state[9];
|
|
for (short i=0; i<available_action_length; i++){
|
|
available_actions_state[i] = *(table + state * ACTION_NUM + available_actions[i]); // table[state][available_actions[i]]
|
|
}
|
|
|
|
short ans_index;
|
|
ans_index = float_argmax(available_actions_state, available_action_length);
|
|
return available_actions_state[ans_index];
|
|
}
|
|
|
|
/*
|
|
Run Q-learning Evaluation or Training.
|
|
|
|
Args:
|
|
- float *table (2-dim array's start address)
|
|
- short *board (1-dim array's start address): chessboard's address
|
|
- bool train: train or not
|
|
- int times: how many episode to simulate
|
|
- bool plot: whether to plot the gaming process
|
|
|
|
Results:
|
|
- None
|
|
*/
|
|
void run(float *table, short *board, bool train, int times, bool plot){
|
|
short available_actions[9];
|
|
short available_actions_length;
|
|
short winner;
|
|
short choice, opponent_choice;
|
|
int state, _state;
|
|
float estimate_r, estimate_r_, real_r, r, opponent_r;
|
|
struct action a;
|
|
|
|
int win = 0;
|
|
|
|
for (int episode=0; episode<times; episode++){
|
|
reset(board);
|
|
state = state_hash(board);
|
|
while (1){
|
|
// bot choose the action
|
|
choice = bot_choose_action(table, board, state);
|
|
a.loc = choice;
|
|
a.player = BOT_SYMBOL;
|
|
|
|
estimate_r = *(table + state * ACTION_NUM + choice);
|
|
act(board, &a, &_state, &r, &opponent_r, &winner);
|
|
if (plot) show(board);
|
|
|
|
// opponent random
|
|
if (winner == 0){
|
|
opponent_choice = opponent_random_action(table, board, state_hash(board));
|
|
if (opponent_choice != -1){
|
|
a.loc = opponent_choice;
|
|
a.player = OPPONENT_SYMBOL;
|
|
act(board, &a, &_state, &opponent_r, &r, &winner);
|
|
if (plot) show(board);
|
|
}
|
|
}
|
|
get_available_actions(board, available_actions, &available_actions_length);
|
|
|
|
if ((winner != 0) || (available_actions_length == 0)){
|
|
if (plot){
|
|
printf("winner: %d, reward: %f, oppo reward: %f\n", winner, r, opponent_r);
|
|
printf("==========================================================\n");
|
|
}
|
|
real_r = r;
|
|
} else {
|
|
estimate_r_ = get_estimate_reward(table, board, _state);
|
|
real_r = r + LAMBDA * estimate_r_;
|
|
}
|
|
if (train){
|
|
// printf("update");
|
|
*(table + state * ACTION_NUM + choice) += ( LR * (real_r - estimate_r) ); // table[state][choice] += LR * (real_r - estimate_r)
|
|
}
|
|
state = _state;
|
|
|
|
if ((winner != 0) || (available_actions_length == 0)){
|
|
// printf("break\n");
|
|
if (winner == 1){
|
|
win += 1;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!train)
|
|
printf("%d/%d, %f\%\n", win, 10000, (float)win/10000);
|
|
}
|
|
|
|
int main(){
|
|
short board[9]= {0}; // tic tac toe's chessboard
|
|
float table[STATE_NUM][ACTION_NUM];
|
|
short available_actions[9];
|
|
short available_actions_length;
|
|
short winner;
|
|
short choice, opponent_choice;
|
|
int state, _state;
|
|
int estimate_r, estimate_r_, real_r, r, opponent_r;
|
|
struct action a;
|
|
|
|
srand(time(NULL));
|
|
init_table(&table[0][0]);
|
|
|
|
run(&table[0][0], board, false, 10000, false);
|
|
run(&table[0][0], board, true, 10000000, false);
|
|
run(&table[0][0], board, false, 10000, false);
|
|
|
|
}
|