run_connect4bot_MCTS.py
from __future__ import print_function
from collections import deque
from connect4bot_MCTS import MCTS_Trainer
from connect4game import move
import tensorflow as tf
import numpy as np
import random
import sys
import time
import copy
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
sess = tf.Session()
# The learning rate decays at each 25% mark of training (piecewise-constant schedule below)
global_step = tf.Variable(0, trainable=False)
minibatches = 400
boundaries = [1, 2, 3]
boundaries = [x * (minibatches//4) for x in boundaries]
values = [0.001, 0.00005, 0.00003, 0.00001]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
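# With minibatches = 400 the boundaries work out to [100, 200, 300], so the rate is
# 0.001 for roughly the first quarter of updates, then 5e-5, 3e-5 and finally 1e-5
# (this assumes MCTS_Trainer applies the optimizer with global_step so the counter advances).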
# Using the learning rate in a standard gradient descent optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
### Model parameters
# Nodes in the first hidden layer
num_hidden1 = 1024
# Nodes in the second hidden layer
num_hidden2 = 1024
# Board size parameters
num_rows = 6
num_cols = 7
# Batch size for each update
batch_size = 500
# Number of MCTS steps
MCTS_rollouts = 250
# Epsilon-greedy parameter NOTE: Since I'm tracking full MC rollouts, this is not used
epsilon = 0.1
# Mixing parameter
lambda_mix = 0.5
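# lambda_mix blends the two backup signals in update_Q below: the Q target for a visited
# (state, action) is (1 - lambda_mix) * V + lambda_mix * z, where V is the network's value
# estimate for the resulting position and z is the final outcome of the rollout.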
def the_model(data):
### Define the neural network
# The input connects to this fully connected layer (6x7x2)[84] -> 1024
W_fc1 = tf.get_variable("W_fc1", shape=[num_rows*num_cols*2, num_hidden1],
initializer=tf.contrib.layers.xavier_initializer()
)
b_fc1 = tf.get_variable("b_fc1", [num_hidden1],
initializer=tf.constant_initializer()
)
# The previous layer then connects to the next fully connected layer
W_fc2 = tf.get_variable("W_fc2", shape=[num_hidden1, num_hidden2],
initializer=tf.contrib.layers.xavier_initializer()
)
b_fc2 = tf.get_variable("b_fc2", [num_hidden2],
initializer=tf.constant_initializer()
)
    # The second hidden layer is fully connected to the outputs of the model (policy + value)
W_fcf = tf.get_variable("W_fcf", shape=[num_hidden2, num_rows*num_cols + 1],
initializer=tf.contrib.layers.xavier_initializer()
)
b_fcf = tf.get_variable("b_fcf", [num_rows*num_cols + 1],
initializer=tf.constant_initializer()
)
# Reshape the data and plug it into the network
flattener = tf.reshape(data, [-1, num_rows*num_cols*2])
h_fc1 = tf.nn.relu(tf.matmul(flattener, W_fc1) + b_fc1)
h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
    # Finally, return the outputs of the model: p has shape [batch, num_rows*num_cols] and v has shape [batch]
o = tf.matmul(h_fc2, W_fcf) + b_fcf
p, v = o[:,0:-1], o[:,-1]
return p, v
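# Shape summary for the_model: data [batch, 6, 7, 2] is flattened to [batch, 84], passed
# through two ReLU layers of 1024 units each, and split into p [batch, 42] (per-cell move
# logits) and v [batch] (a scalar win prediction per board).
# A minimal, commented-out shape check (a sketch, assuming the TF1 graph built above; it
# would need its own variable scope so it does not clash with the trainer's variables):
#   with tf.variable_scope("shape_check"):
#       p_chk, v_chk = the_model(tf.placeholder(tf.float32, [None, num_rows, num_cols, 2]))
#   # p_chk: [None, 42], v_chk: [None]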
# Restore parameters from a past run?
restore_params = False
# Set up the model
trainer = MCTS_Trainer(sess,
optimizer,
the_model,
restore_params,
batch_size,
num_hidden1,
num_hidden2,
)
saver = tf.train.Saver()
# Function for converting a given board into a string
def concatenate_state_data(Board):
result = ''
for row in Board:
for element in row:
result += str(element)
return result
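# Example: an empty 6x7 board becomes the 42-character string "000...000". These strings
# key Board_index, so a position reached again during the rollouts reuses its cached
# P/Q/N/V statistics instead of being re-evaluated by the network.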
# Functions for updating all N's and Q's for boards that were used
def update_N(N_index, indices, actions):
for a, i in enumerate(indices):
N_index[i][actions[a]] = N_index[i][actions[a]] + 1
def update_Q(Q_index, N_index, V_index, lambda_mix, score, indices, actions):
fun_list = []
for a, i in enumerate(indices):
# The indices are lined up such that they're in the same order as moves.
# Also, Q should be switched between every move because if Player 1 won,
# then Player 2 must have lost.
# So you always start at "score", and then switch score to negative for the next one.
# If score is 1, then first move is +1, next move is -1, next move is +1, etc.
# If score is -1, then first move is -1 (lost), next move is +1, next is -1...
# If score is 0, then score is 0 the entire way through.
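        # In formula form, for each visited (state s, action a) along the rollout path:
        #   Q(s,a) <- Q(s,a) + (1 / N(s,a)) * ((1 - lambda_mix) * V(s,a) + lambda_mix * score - Q(s,a))
        # i.e. a running mean of the mixed target, with score alternating sign each ply as
        # described above.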
Q_index[i][actions[a]] = Q_index[i][actions[a]] + (1/N_index[i][actions[a]]) * (
+ (1 - lambda_mix) * V_index[i][actions[a]]
+ lambda_mix * score
- Q_index[i][actions[a]]
)
fun_list.append(Q_index[i][actions[a]])
score = -score
# Function to remove indices not associated with current/future boards in a rollout
def index_cleaner(Board, Board_index, pred_index, P_index, Q_index, N_index, V_index):
# Convert updated board to its string
Board_string = concatenate_state_data(Board)
# Begin a list of now-impossible strings
unwanted_list = []
# Check if the pieces in the new board match pieces in the indexed board
for index, index_string in enumerate(Board_index):
if len([c for c,d in zip(Board_string, index_string) if c!="0" and c!=d]) > 0:
unwanted_list.append(index)
#print(Board_string)
#print(index_string)
# Re-make all the lists as only the matching indices
Board_index = [i for j,i in enumerate(Board_index) if j not in unwanted_list]
pred_index = [i for j,i in enumerate(pred_index) if j not in unwanted_list]
P_index = [i for j,i in enumerate(P_index) if j not in unwanted_list]
Q_index = [i for j,i in enumerate(Q_index) if j not in unwanted_list]
N_index = [i for j,i in enumerate(N_index) if j not in unwanted_list]
V_index = [i for j,i in enumerate(V_index) if j not in unwanted_list]
return Board_index, pred_index, P_index, Q_index, N_index, V_index
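# A cached board is unreachable once it disagrees with any piece already placed on the real
# board (placed pieces never change), so index_cleaner drops those entries to keep the
# lookup tables small. Note that it is not called anywhere in the training loop below.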
# Saved parameters for user output and checking
one_wins = 0
two_wins = 0
old_one = 0
old_two = 0
game_list = []
first_wins = []
second_wins = []
one_buffer = 0
two_buffer = 0
one_wins_list = []
two_wins_list = []
tie_list = []
# Constants for deciding actions
Q_coeff = 1
P_coeff = 1
UCB_coeff = 1
dirichlet_factor = 5
dirichlet_mult_constant = 0.5
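# Q_coeff, P_coeff and UCB_coeff weight the three terms of the move-selection score used in
# the rollouts below: the running action value Q, the prior P discounted by visit count, and
# a UCB exploration bonus. dirichlet_factor and dirichlet_mult_constant control the Dirichlet
# noise mixed into the root priors for extra exploration (in the spirit of AlphaGo Zero).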
# Prepare a plot
fig = plt.figure(1)
plt.xlabel("Batch number")
plt.ylabel("Win percentage Versus Random")
red = mpatches.Patch(color='red', label='Wins as Player 1')
blue = mpatches.Patch(color='blue', label='Wins as Player 2')
plt.legend(handles=[red, blue], loc=2)
plt.grid(True)
fig.canvas.draw()
plt.show(block=False)
# A loop over the number of minibatches
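# Each minibatch: play batch_size self-play moves (each chosen by MCTS_rollouts Monte Carlo
# rollouts), update the network on the stored states, visit-count targets, value predictions
# and game outcomes, save a checkpoint, and then play 1000 evaluation games against a random
# opponent.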
for batch_num in range(minibatches):
    # In each minibatch, batch_size moves are played, and the state, MCTS action target and value prediction are recorded for each
state_list = np.zeros((batch_size, 6, 7, 2), dtype=np.float32)
action_target_list = np.zeros((batch_size, 42), dtype=np.float32)
prediction_list = np.zeros((batch_size,), dtype=np.float32)
outcome_list = np.zeros((batch_size,), dtype=np.float32)
Game_start = 1
# A check for whether a test game should be played after a model update
play_game = 1
# A loop over the batch size, in each minibatch
for mini_step in range(batch_size):
# If it's a new game
if Game_start == 1:
# Reset the environment and rewards
Board = np.zeros((6,7), dtype=int)
feed_board = np.zeros((1,6,7,2), dtype=np.float32)
new_feed_board = np.zeros((1,6,7,2), dtype=np.float32)
player_num = 1
other_player = 2
Reward = 0
action = 0
start_step = mini_step
Board_index = []
pred_index = []
P_index = []
N_index = []
V_index = []
Q_index = []
index_list = []
action_list = []
Game_start = 0
# The game has been initialized, or is continuing.
# Store the board state.
state_list[mini_step, :, :, :] = feed_board[:]
# Set the MCTS player numbers
MCTS_player_num = player_num
MCTS_other_player = other_player
# Check to see if the current board has been checked before in this run:
Board_string = concatenate_state_data(Board)
if Board_string in Board_index:
root_index = Board_index.index(Board_string)
else:
# Calculate prior probabilities of moves and a win prediction of the current state
MCTS_P, prediction = trainer.sampleAction(feed_board)
# Calculate sample values for the following states
MCTS_V = trainer.sampleValues(feed_board, MCTS_P, MCTS_player_num)
# Set N and Q to all zeros
MCTS_N = np.zeros((42,), dtype=int)
MCTS_Q = np.zeros((42,), dtype=np.float32)
# Add these values to the indexes
Board_index.append(Board_string)
pred_index.append(prediction)
P_index.append(MCTS_P)
V_index.append(MCTS_V)
N_index.append(MCTS_N)
Q_index.append(MCTS_Q)
# Note down what the index of this board state is
root_index = len(Board_index) - 1
# Add in Dirichlet noise to the root P values (NOTE: Lower dirichlet factor -> numbers more spread out)
D_mask = P_index[root_index] != 0
P_index[root_index][D_mask] = P_index[root_index][D_mask] + dirichlet_mult_constant * np.random.dirichlet(
np.ones(np.count_nonzero(P_index[root_index]), dtype=int) * [dirichlet_factor]
)
# Store the prediction of how good the current state is
prediction_list[mini_step] = prediction
# Some monitoring parameters
wincheck = 0
losscheck = 0
tiecheck = 0
moves = 0
for rollout_num in range(MCTS_rollouts):
            # Order of operations:
            # 1) Pick a move at the root using P/N/Q, the UCB bonus and Dirichlet noise
            # 2) Make that move and build a new MCTS_feed_board
            # 3) Using MCTS_feed_board, choose the next action greedily from P/Q/N
            # 4) Repeat (2, 3) until the game is over
            # 5) Update MCTS N and Q along the visited path
# Add the root to the list of all indices to update
index_list = [root_index]
# Set MCTS_player_num and MCTS_other_player to the main players
MCTS_player_num = player_num
MCTS_other_player = other_player
# Determine an action, among all legal actions
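            # Selection score (a PUCT/UCB1-style mix): Q_coeff*Q(s,a) + P_coeff*P(s,a)/(1 + N(s,a)),
            # with entries that are exactly 0.0 masked out of the argmax; once the root has at
            # least two recorded visits, a bonus UCB_coeff*sqrt(ln(sum_b N(s,b)) / N(s,a)) is added.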
pre_action_matrix = (Q_coeff * Q_index[root_index]
+ P_coeff * P_index[root_index]/(1+N_index[root_index]))
masked_matrix = np.ma.masked_equal(pre_action_matrix, 0.0, copy=False)
if sum(N_index[root_index])<2:
masked_matrix += np.inf
else:
masked_matrix += UCB_coeff * np.sqrt(np.log(sum(N_index[root_index]))/N_index[root_index])
action_index = masked_matrix.argmax()
action_list = [action_index]
first_column = (action_index % 7) + 1
# Perform the move on the starting board, and make that the MCTS board
MCTS_Board, MCTS_Reward = move(Board, first_column)
moves += 1
# Turn the MCTS board into a feed board for the opposing player, for their turn
MCTS_feed_board = np.zeros((1,6,7,2), dtype=np.float32)
if not MCTS_Reward:
for row in range(MCTS_Board.shape[0]):
for col in range(MCTS_Board.shape[1]):
if MCTS_Board[row, col] == MCTS_other_player:
MCTS_feed_board[0, row, col, 0] = 1
elif MCTS_Board[row, col] != 0:
MCTS_feed_board[0, row, col, 1] = 1
if MCTS_player_num == 1:
MCTS_other_player = 1
MCTS_player_num = 2
else:
MCTS_player_num = 1
MCTS_other_player = 2
elif MCTS_Reward == -1: #Tie game, z_value is 0
update_N(N_index, index_list, action_list)
update_Q(Q_index, N_index, V_index, lambda_mix, 0, index_list, action_list)
tiecheck += 1
else: #Since this is the first move from the board position, winner must be current player
update_N(N_index, index_list, action_list)
update_Q(Q_index, N_index, V_index, lambda_mix, 1, index_list, action_list)
wincheck += 1
# Now that the first move has been selected, play the rest of the game using just P/Q/N
while MCTS_Reward == 0:
# Check if the current board is already saved in the index
Board_string = concatenate_state_data(MCTS_Board)
if Board_string in Board_index:
index = Board_index.index(Board_string)
else:
# Calculate prior probabilities of moves and a win prediction of the current state
MCTS_P, prediction = trainer.sampleAction(MCTS_feed_board)
                    # Calculate sample values for the following states
MCTS_V = trainer.sampleValues(MCTS_feed_board, MCTS_P, MCTS_player_num)
# Set N and Q to all zeros
MCTS_N = np.zeros((42,), dtype=int)
MCTS_Q = np.zeros((42,), dtype=np.float32)
# Add these values to the indexes
Board_index.append(Board_string)
pred_index.append(prediction)
P_index.append(MCTS_P)
V_index.append(MCTS_V)
N_index.append(MCTS_N)
Q_index.append(MCTS_Q)
# Note down what the index of this board state is
index = len(Board_index) - 1
# Add the current board to the overall index list for later updates
index_list.append(index)
# Use the indexed information to find the max of Q(s, a) + u(s, a) in this state
action_matrix = Q_coeff * Q_index[index] + P_coeff * P_index[index]/(1 + N_index[index])
masked_matrix = np.ma.masked_equal(action_matrix, 0.0, copy=False)
action_index = masked_matrix.argmax()
action_list.append(action_index)
column = (action_index % 7) + 1
# Place a piece in that column
MCTS_Board, MCTS_Reward = move(MCTS_Board, column)
# Make a new feed board, should be from the perspective of the opposing player
MCTS_feed_board = np.zeros((1,6,7,2), dtype=np.float32)
                if not MCTS_Reward:  # game still in progress; otherwise MCTS_feed_board stays all zeros
for row in range(MCTS_Board.shape[0]):
for col in range(MCTS_Board.shape[1]):
if MCTS_Board[row, col] == MCTS_other_player:
MCTS_feed_board[0, row, col, 0] = 1
elif MCTS_Board[row, col] != 0:
MCTS_feed_board[0, row, col, 1] = 1
if MCTS_player_num == 1:
MCTS_other_player = 1
MCTS_player_num = 2
else:
MCTS_player_num = 1
MCTS_other_player = 2
elif MCTS_Reward == -1: #Tie game, z_value is 0
update_N(N_index, index_list, action_list)
update_Q(Q_index, N_index, V_index, lambda_mix, 0, index_list, action_list)
tiecheck += 1
elif MCTS_player_num == player_num: #If the winner was the current player, z=1
update_N(N_index, index_list, action_list)
update_Q(Q_index, N_index, V_index, lambda_mix, 1, index_list, action_list)
wincheck += 1
else: #Otherwise the winner is the opposing player, z=-1
update_N(N_index, index_list, action_list)
update_Q(Q_index, N_index, V_index, lambda_mix, -1, index_list, action_list)
losscheck += 1
# By this point, all of the Monte Carlo rollouts have finished, and a move is chosen. If there is
# a tie, a move is randomly selected between the tied moves.
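        # Multiplying a random vector by the boolean mask (N == N.max()) leaves a random value
        # only at the tied maxima (zero elsewhere), so argmax picks one of them uniformly.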
action = np.argmax(np.random.random(N_index[root_index].shape) * (N_index[root_index]==N_index[root_index].max()))
# Store a normalized vector of the visit counts as the action target for the given board
action_target_list[mini_step,:] = N_index[root_index] / MCTS_rollouts
# Perform the move
Board, Reward = move(Board, (action % 7) + 1)
print("Visits to each move:")
print(np.around(np.reshape(N_index[root_index], [6,7]), 3))
print("Current Board:")
print(Board)
print("After {} MC runs, column {} was selected.".format(moves, (action % 7) + 1))
print("Progress: {} Updates done, {}/{} in the batch".format(batch_num, mini_step+1, batch_size))
# Translate the new board state into a feed_board for the next move
feed_board = np.zeros((1,6,7,2), dtype=np.float32)
if not Reward:
for row in range(Board.shape[0]):
for col in range(Board.shape[1]):
if Board[row, col] == other_player:
feed_board[0, row, col, 0] = 1
elif Board[row, col] != 0:
feed_board[0, row, col, 1] = 1
if player_num == 1:
other_player = 1
player_num = 2
else:
player_num = 1
other_player = 2
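        # Game over (the elif/else below): every stored position from this game is labeled with
        # its outcome from the perspective of the player who moved from it: 0 for a tie, +1 for
        # the eventual winner's moves, -1 for the loser's. The winner is still player_num here,
        # because the turn swap above only happens while Reward is zero.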
elif Reward == -1:
            outcome_list[start_step:mini_step+1] = [0]*(mini_step+1-start_step)
Game_start = 1
else:
            outcome_list[start_step:mini_step+1] = [1]*(mini_step+1-start_step)
if player_num == 1:
                outcome_list[start_step+1:mini_step:2] = [-1]*((mini_step-start_step)//2)
else:
                outcome_list[start_step:mini_step:2] = [-1]*((mini_step+1-start_step)//2)
Game_start = 1
# By this point, we have run all of the moves in the batch. It's time to update the neural network.
trainer.updateModel(state_list, action_target_list, prediction_list, outcome_list)
# Save the network.
save_path = saver.save(sess, "/tmp/simple/model.ckpt")
print("Model saved in path: %s" % save_path)
# Play games against a random opponent to check progress
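    # The network plays greedily from its policy output (argmax over the non-zero entries),
    # while the random opponent drops into a uniformly random column that is not yet full.
    # 500 games are played with the network as Player 1 and 500 with it as Player 2.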
if play_game:
go_first = np.zeros([3]) #Format is win/loss/tie
go_second = np.zeros([3])
for fake_game in range(1000):
# Reset the environment and rewards
Board = np.zeros((6,7), dtype=int)
feed_board = np.zeros((1,6,7,2), dtype=np.float32)
# Network is Player 1 for 500 games, then Player 2 for 500 games, vs. random player
if fake_game < 500:
network_player = 1
else:
network_player = 2
current_player = 1
Reward = 0
while Reward == 0:
if current_player == network_player:
actions, _ = trainer.sampleAction(feed_board)
masked_actions = np.ma.masked_equal(actions, 0.0, copy=False)
action = masked_actions.argmax()
else:
action = np.random.choice(range(7),1)[0]
while Board[0,action] != 0:
action = np.random.choice(range(7),1)[0]
column = (action % 7) + 1
Board, Reward = move(Board, column)
# Swap player's turn
if not Reward:
if current_player == 1:
current_player = 2
else:
current_player = 1
# Reset, then reformat the current board into [1 x 6 x 7 x 2] for the next move (if applicable)
feed_board = np.zeros((1,6,7,2), dtype=np.float32)
for row in range(Board.shape[0]):
for col in range(Board.shape[1]):
if Board[row, col] == current_player:
feed_board[0, row, col, 0] = 1
elif Board[row, col] != 0:
feed_board[0, row, col, 1] = 1
if Reward == -1: #Indicates a tie
if network_player == 1:
go_first[2] += 1
else:
go_second[2] += 1
elif current_player == network_player: #then the reward must be 1, and network player must have won
if network_player == 1:
go_first[0] += 1
else:
go_second[0] += 1
else: #A player won, but it was not the network_player
if network_player == 1:
go_first[1] += 1
else:
go_second[1] += 1
# Print after games against the random player
if play_game:
print("Going 1st: {} wins, {} losses, {} ties".format(*go_first))
print("Going 2nd: {} wins, {} losses, {} ties".format(*go_second))
game_list.append(batch_num+1)
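        # go_first and go_second each cover 500 games, so dividing a win count by 5 gives a
        # win percentage for the plot below.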
first_wins.append(go_first[0]/5)
second_wins.append(go_second[0]/5)
one_wins_list.append(one_buffer)
two_wins_list.append(two_buffer)
tie_list.append(100-one_buffer-two_buffer)
        # Plot the win percentage against the random player when the network plays
        # as Player 1 and as Player 2.
plt.plot(game_list, first_wins, 'r--', game_list, second_wins, 'b--')
plt.axis([1, batch_num+1, 0, 100])
fig.canvas.draw()
time.sleep(0.01)
sys.stdout.flush()
# This makes sure the ending window doesn't close
plt.show()