# Tutorial 2: Value-Based Player¶

Week 3, Day 5: Reinforcement Learning for Games

Content creators: Mandana Samiei, Raymond Chua, Tim Lillicrap, Blake Richards

Content reviewers: Arush Tagade, Lily Cheng, Melvin Selim Atay, Kelson Shilling-Scrivo

Content editors: Melvin Selim Atay, Spiros Chavlis, Gunnar Blohm

Production editors: Namrata Bafna, Gagana B, Spiros Chavlis

# Tutorial Objectives¶

In this tutorial, you will implement a value-based player.

# Setup¶

## Install dependencies¶

# @title Install dependencies
!pip install coloredlogs --quiet

from evaltools.airtable import AirtableForm

# generate airtable form

# Imports
import os
import time
import torch
import random
import logging
import coloredlogs

import numpy as np

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm.notebook import tqdm
from pickle import Unpickler

log = logging.getLogger(__name__)


## Set random seed¶

Executing `set_seed(seed=seed)` sets the random seed.

# @title Set random seed

# @markdown Executing set_seed(seed=seed) you are setting the seed

# For DL it's critical to set the random seed so that students have a
# baseline against which to compare their results.

# Call set_seed function in the exercises to ensure reproducibility.
import random
import torch

def set_seed(seed=None, seed_torch=True):
    """
    Function that controls randomness. NumPy and random modules must be imported.

    Seeds Python's `random` module and NumPy's global generator and,
    optionally, PyTorch (CPU and CUDA). When PyTorch is seeded, cuDNN
    autotuning is switched off and deterministic kernels are requested.

    Args:
      seed : Integer
        A non-negative integer that defines the random state. If None, a
        seed is drawn at random from [0, 2**32). Default is None.
      seed_torch : Boolean
        If True sets the random seed for pytorch tensors, so pytorch module
        must be imported. Default is True.

    Returns:
      Nothing.
    """
    if seed is None:
        # Convert to a built-in int: random.seed() rejects NumPy integer
        # scalars on Python >= 3.11. The explicit 64-bit dtype keeps the
        # bound 2**32 representable where NumPy's default int is 32-bit.
        seed = int(np.random.randint(0, 2 ** 32, dtype=np.int64))
    random.seed(seed)
    np.random.seed(seed)
    if seed_torch:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    print(f'Random seed {seed} has been set.')

# In case that DataLoader is used
def seed_worker(worker_id):
    """
    Re-seed NumPy and Python's `random` from PyTorch's current initial seed.

    Meant for the case where a DataLoader is used, so that each worker's
    NumPy / `random` state follows the PyTorch seed.

    Args:
      worker_id: integer
        ID of subprocess to seed (not used in the body)

    Returns:
      Nothing
    """
    # Fold the PyTorch seed into the 32-bit range accepted by np.random.seed
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)


## Set device (GPU or CPU). Execute set_device()¶

# @title Set device (GPU or CPU). Execute set_device()
# especially if torch modules used.

# Inform the user if the notebook uses GPU or CPU.

def set_device():
    """
    Choose the compute device and report which one is in use.

    Args:
      None

    Returns:
      device: string
        "cuda" if a CUDA device is available, "cpu" otherwise
    """
    if torch.cuda.is_available():
        print("GPU is enabled in this notebook.")
        return "cuda"

    # No GPU: point the user at the runtime setting, then fall back to CPU
    print("WARNING: For this notebook to perform best, "
          "if possible, in the menu under Runtime -> "
          "Change runtime type.  select GPU ")
    return "cpu"

# Seed used for every set_seed() call in this notebook
SEED = 2021
set_seed(seed=SEED)
# Device string ("cuda" or "cpu") later stored in args['device']
DEVICE = set_device()

Random seed 2021 has been set.
WARNING: For this notebook to perform best, if possible, in the menu under Runtime -> Change runtime type.  select GPU


# @title Download the modules

# @markdown Run this cell!

import os, io, sys, shutil, zipfile
from urllib.request import urlopen

#!git clone git://github.com/raymondchua/nma_rl_games.git --quiet
# Directory whose alpha-zero subfolder is added to sys.path below
REPO_PATH = 'nma_rl_games'

# Remove any existing copy of the repository directory
if os.path.exists(REPO_PATH):
shutil.rmtree(REPO_PATH)
else:
# NOTE(review): the body of this `else` branch is missing from the file - TODO restore

# NOTE(review): `zipurl` and `zfile` are never defined in the visible code; the lines that
# set the download URL and open the archive (presumably via zipfile.ZipFile) appear to be
# missing - TODO restore before running this cell
with urlopen(zipurl) as zipresp:
zfile.extractall()

print(f"Add the {REPO_PATH} in the path and import the modules.")
# add the repo in the path
sys.path.append('nma_rl_games/alpha-zero')

# @markdown Import modules designed for use in this notebook
import Arena

from utils import *
from Game import Game
from NeuralNet import NeuralNet

from othello.OthelloLogic import Board

Redownloading and unzipping the file... Please wait.

Download completed.
Add the nma_rl_games in the path and import the modules.


## Helper functions from previous tutorials¶

# @title Helper functions from previous tutorials
class OthelloGame(Game):
    """
    Instantiate Othello Game

    Squares hold -1, 0 or +1 (printed through `square_content`). Actions are
    integers: n*x + y for a move on square (x, y), and n*n for the pass
    (no-op) action.
    """
    square_content = {
        -1: "X",
        +0: "-",
        +1: "O"
    }

    @staticmethod
    def getSquarePiece(piece):
        # Map a square value to its printable character
        return OthelloGame.square_content[piece]

    def __init__(self, n):
        # n: side length of the square board
        self.n = n

    def getInitBoard(self):
        # Return initial board (numpy board)
        b = Board(self.n)
        return np.array(b.pieces)

    def getBoardSize(self):
        # (a,b) tuple
        return (self.n, self.n)

    def getActionSize(self):
        # Return number of actions, n is the board size and +1 is for no-op action
        return self.n*self.n + 1

    def getCanonicalForm(self, board, player):
        # Return state if player==1, else return -state if player==-1
        return player*board

    def stringRepresentation(self, board):
        # Compact encoding of the board: the raw bytes of the array
        return board.tobytes()

    def stringRepresentationReadable(self, board):
        # Human-readable encoding: one square_content character per square, row by row
        board_s = "".join(self.square_content[square] for row in board for square in row)
        return board_s

    def getScore(self, board, player):
        # Score of `player` on `board`, as computed by Board.countDiff
        b = Board(self.n)
        b.pieces = np.copy(board)
        return b.countDiff(player)

    @staticmethod
    def _printGrid(grid):
        # Print a square grid of square values with row and column indices
        n = grid.shape[0]
        print("   ", end="")
        for y in range(n):
            print(y, end=" ")
        print("")
        print("-----------------------")
        for y in range(n):
            print(y, "|", end="")    # Print the row
            for x in range(n):
                piece = grid[y][x]    # Get the piece to print
                print(OthelloGame.square_content[piece], end=" ")
            print("|")
        print("-----------------------")

    @staticmethod
    def display(board):
        # Print the board; board.shape[0] gives the side length
        OthelloGame._printGrid(board)

    @staticmethod
    def displayValidMoves(moves):
        # Display possible moves
        # The last entry of `moves` is the pass action; the remaining n*n
        # entries are laid out row by row, so n is recovered from the length
        # instead of relying on a `board` variable that is not in scope here.
        n = int(round((len(moves) - 1) ** 0.5))
        A = np.reshape(moves[0:-1], (n, n))
        print("  ")
        print("possible moves")
        OthelloGame._printGrid(A)

    def getNextState(self, board, player, action):
        """
        Helper function to make valid move
        If player takes action on board, return next (board,player)
        and action must be a valid move

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]
          player: Integer
            ID of current player
          action: Integer
            Index of the action to play; n*n means pass

        Returns:
          (board,player) tuple signifying next state
        """
        if action == self.n*self.n:
            # Pass: the board is unchanged, only the turn switches
            return (board, -player)
        b = Board(self.n)
        b.pieces = np.copy(board)
        move = (int(action/self.n), action%self.n)
        b.execute_move(move, player)
        return (b.pieces, -player)

    def getValidMoves(self, board, player):
        """
        Helper function listing the moves available to player on board

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]
          player: Integer
            ID of current player

        Returns:
          valids: np.ndarray
            Returns a fixed size binary vector of length getActionSize();
            entry n*x+y is 1 for each legal move (x, y). If there is no
            legal move, only the last (pass) entry is 1.
        """
        valids = [0]*self.getActionSize()
        b = Board(self.n)
        b.pieces = np.copy(board)
        legalMoves = b.get_legal_moves(player)
        if len(legalMoves) == 0:
            valids[-1] = 1
            return np.array(valids)
        for x, y in legalMoves:
            valids[self.n*x+y] = 1
        return np.array(valids)

    def getGameEnded(self, board, player):
        """
        Helper function to signify if game has ended

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]
          player: Integer
            ID of current player

        Returns:
          0 if either player still has a legal move; otherwise 1 if
          Board.countDiff(player) is positive and -1 if it is not
          (so a zero difference also yields -1)
        """
        b = Board(self.n)
        b.pieces = np.copy(board)
        if b.has_legal_moves(player):
            return 0
        if b.has_legal_moves(-player):
            return 0
        if b.countDiff(player) > 0:
            return 1
        return -1

    def getSymmetries(self, board, pi):
        """
        Get mirror/rotational configurations of board

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]
          pi: np.ndarray
            Vector of length n*n + 1 (one entry per square plus the pass action)

        Returns:
          l: list
            Eight (board, pi) pairs: each of the four 90-degree rotations,
            with and without a left-right flip. The pass entry of pi is
            carried over unchanged.
        """
        assert(len(pi) == self.n**2+1)  # 1 for pass
        pi_board = np.reshape(pi[:-1], (self.n, self.n))
        l = []

        for i in range(1, 5):
            for j in [True, False]:
                newB = np.rot90(board, i)
                newPi = np.rot90(pi_board, i)
                if j:
                    newB = np.fliplr(newB)
                    newPi = np.fliplr(newPi)
                l += [(newB, list(newPi.ravel()) + [pi[-1]])]
        return l

class RandomPlayer():
    """
    Player that picks uniformly at random among the valid moves
    """

    def __init__(self, game):
        self.game = game

    def play(self, board):
        """
        Choose a move for the given board

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]

        Returns:
          a: int
            Randomly chosen move
        """
        # Binary mask over all actions: 1 where the move is valid
        legal_mask = self.game.getValidMoves(board, 1)

        # Normalising the mask gives a uniform distribution over valid moves
        # and probability 0 everywhere else
        move_probs = legal_mask / legal_mask.sum()

        # Draw one action index according to those probabilities
        num_actions = self.game.getActionSize()
        return np.random.choice(num_actions, p=move_probs)


The hyperparameters used throughout the notebook.

# Hyperparameters shared across the notebook; the code reads them as
# attributes (e.g. args.lr, args.device). dotdict is presumably provided by
# `from utils import *` - TODO confirm.
args = dotdict({
'numIters': 1,            # In training, number of iterations = 1000 and num of episodes = 100
'numEps': 1,              # Number of complete self-play games to simulate during a new iteration.
'tempThreshold': 15,      # To control exploration and exploitation
'updateThreshold': 0.6,   # During arena playoff, new neural net will be accepted if threshold or more of games are won.
'maxlenOfQueue': 200,     # Number of game examples to train the neural networks.
'numMCTSSims': 15,        # Number of games moves for MCTS to simulate.
'arenaCompare': 10,       # Number of games to play during arena play to determine if new net will be accepted.
'cpuct': 1,               # NOTE(review): presumably the MCTS exploration constant - not used in the visible code, confirm
'maxDepth':5,             # Maximum number of rollouts
'numMCsims': 5,           # Number of monte carlo simulations
'mc_topk': 3,             # Top k actions for monte carlo rollout

'checkpoint': './temp/',  # NOTE(review): presumably the checkpoint folder - not read in the visible code, confirm
'numItersForTrainExamplesHistory': 20,

# Define neural network arguments
'lr': 0.001,               # lr: Learning Rate
'dropout': 0.3,            # Dropout probability applied after the first two fully-connected layers of OthelloNNet
'epochs': 10,              # Number of epochs ValueNetwork.train runs per list of examples
'batch_size': 64,          # Number of examples sampled per training batch
'device': DEVICE,          # Device the network and tensors are moved to
'num_channels': 512,       # Number of channels of each convolutional layer in OthelloNNet
})


# Section 1: Train a value function from expert game data¶

Time estimate: ~25mins

Now that we have the game set up and working, we can build a (hopefully) smarter player by learning a value function using expert game data. Our player can then use this value function to decide what moves to make.

Goal: Learn how to train a value function from a dataset of games played by an expert.

Exercise:

• Load a dataset of expert generated games.

• Train a network to minimize MSE for win/loss predictions given board states sampled throughout the game. This will be done on a very small number of games. We will provide a network trained on a larger dataset.

## Section 1.1. Load expert data¶

def loadTrainExamples(folder, filename):
    """
    Helper function to load Training examples

    The examples are read from `<folder>/<filename>.examples`. If that file
    does not exist the user is asked whether to continue; any answer other
    than "y" exits the program.

    Args:
      folder: string
        Path specifying training examples
      filename: string
        File name of training examples (without the ".examples" suffix)

    Returns:
      trainExamplesHistory: list
        The unpickled examples, or an empty list when the file is missing
        and the user chose to continue
    """
    trainExamplesHistory = []
    modelFile = os.path.join(folder, filename)
    examplesFile = modelFile + ".examples"
    if not os.path.isfile(examplesFile):
        print(f'File "{examplesFile}" with train examples not found!')
        r = input("Continue? [y|n]")
        if r != "y":
            sys.exit()
    else:
        print("File with train examples found. Loading it...")
        # NOTE: unpickling can execute arbitrary code - only load example
        # files that come from a trusted source.
        with open(examplesFile, "rb") as f:
            trainExamplesHistory = Unpickler(f).load()
        print("Loading done!")
    return trainExamplesHistory

path = "nma_rl_games/alpha-zero/pretrained_models/data/"

File with train examples found. Loading it...

Loading done!


## Section 1.2. Define the Neural Network Architecture for Othello¶

We will (somewhat arbitrarily) use a deep CNN with 4 convolutional layers and 4 linear layers with ReLU transfer functions and batch normalization. One reason why convolutions are interesting here is because they can extract the local value of moves on the board regardless of board position; convolution would thus be able to extract the translation-invariant aspects of the play.

For the value network, the 3rd linear layer represents the policy and the 4th linear layer (output) represents the value function. The value function is a policy-weighted sum over all actions.

We can do this by assuming that the weights between linear layers 3 and 4 approximate the action-value function, $w_{l_{34}}=Q^{\pi}(s,a)$, in:

$$V^{\pi}(s) = \sum_{a}{\pi(a,s) \cdot Q^{\pi}(s,a)} \tag{130}$$

Note: OthelloNNet has 2 outputs:

1. log-softmax of linear layer 3

2. tanh of linear layer 4

### Coding Exercise 1.2: Implement the NN OthelloNNet for Othello¶

We implement most of OthelloNNet below but please complete the code to get the final outputs

class OthelloNNet(nn.Module):
    """
    Instantiate Othello Neural Net with following configuration
    nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1) # Convolutional Layer 1
    nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1, padding=1) # Convolutional Layer 2
    nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1) # Convolutional Layer 3
    nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1) # Convolutional Layer 4
    nn.BatchNorm2d(args.num_channels) X 4
    nn.Linear(args.num_channels * (self.board_x - 4) * (self.board_y - 4), 1024) # Fully-connected Layer 1
    nn.Linear(1024, 512) # Fully-connected Layer 2
    nn.Linear(512, self.action_size) # Fully-connected Layer 3
    nn.Linear(512, 1) # Fully-connected Layer 4
    """

    def __init__(self, game, args):
        """
        Initialise game parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;
          args: dictionary
            Instantiates number of iterations and episodes, controls temperature threshold, queue length,
            arena, checkpointing, and neural network parameters:
            learning-rate: 0.001, dropout: 0.3, epochs: 10, batch_size: 64,
            num_channels: 512

        Returns:
          Nothing
        """
        # Initialise nn.Module before any attribute is attached to it
        super().__init__()

        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.args = args

        # Layers 1-2 are padded (spatial size preserved); layers 3-4 are not,
        # so each of them shrinks both board dimensions by 2
        self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1,
                               padding=1)
        self.conv3 = nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1)
        self.conv4 = nn.Conv2d(args.num_channels, args.num_channels, 3, stride=1)

        self.bn1 = nn.BatchNorm2d(args.num_channels)
        self.bn2 = nn.BatchNorm2d(args.num_channels)
        self.bn3 = nn.BatchNorm2d(args.num_channels)
        self.bn4 = nn.BatchNorm2d(args.num_channels)

        self.fc1 = nn.Linear(args.num_channels * (self.board_x - 4) * (self.board_y - 4), 1024)
        self.fc_bn1 = nn.BatchNorm1d(1024)

        self.fc2 = nn.Linear(1024, 512)
        self.fc_bn2 = nn.BatchNorm1d(512)

        self.fc3 = nn.Linear(512, self.action_size)

        self.fc4 = nn.Linear(512, 1)

    def forward(self, s):
        """
        Controls forward pass of OthelloNNet

        Args:
          s: torch.Tensor
            Tensor of size (batch_size x board_x x board_y)

        Returns:
          Probability distribution over actions at the current state and the value of the current state.
        """
        s = s.view(-1, 1, self.board_x, self.board_y)                # batch_size x 1 x board_x x board_y
        s = F.relu(self.bn1(self.conv1(s)))                          # batch_size x num_channels x board_x x board_y
        s = F.relu(self.bn2(self.conv2(s)))                          # batch_size x num_channels x board_x x board_y
        s = F.relu(self.bn3(self.conv3(s)))                          # batch_size x num_channels x (board_x-2) x (board_y-2)
        s = F.relu(self.bn4(self.conv4(s)))                          # batch_size x num_channels x (board_x-4) x (board_y-4)
        s = s.view(-1, self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))

        s = F.dropout(F.relu(self.fc_bn1(self.fc1(s))), p=self.args.dropout, training=self.training)  # batch_size x 1024
        s = F.dropout(F.relu(self.fc_bn2(self.fc2(s))), p=self.args.dropout, training=self.training)  # batch_size x 512

        pi = self.fc3(s)  # batch_size x action_size
        v = self.fc4(s)   # batch_size x 1
        #################################################
        ## TODO for students: Compute the outputs of OthelloNNet in this order
        # 1. Log softmax of linear layer 3
        # 2. tanh of linear layer 4
        # Fill out function and remove
        raise NotImplementedError("Calculate the probability distribution and the value")
        #################################################
        # Returns probability distribution over actions at the current state and the value of the current state.
        return ..., ...

atform.add_event('Coding Exercise 1.2: Implement the NN OthelloNNet for Othello')


Click for solution

## Section 1.3. Define the Value network¶

Next we need to implement the training of the network we created above. We want to train it to approximate the value function - we will use real examples (the expert data from above) to train it. So we need to specify the standard initialization, training, prediction and loss functions.

Note: During training, the ground truth will be loaded from the MCTS simulations available at checkpoint_x.pth.tar.examples.

### Coding Exercise 1.3: Implement the ValueNetwork¶

class ValueNetwork(NeuralNet):
    """
    Initiates the Value Network

    Wraps an OthelloNNet with training, prediction, loss and checkpointing
    for the value output. Hyperparameters are read from the notebook-level
    `args`.
    """

    def __init__(self, game):
        """
        Initialise network parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;

        Returns:
          Nothing
        """
        self.nnet = OthelloNNet(game, args)
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.nnet.to(args.device)

    def train(self, games):
        """
        Function to train value network

        Args:
          games: list
            List of examples with each example is of form (board, pi, v)

        Returns:
          Nothing
        """
        # One optimizer for the whole run, using the learning rate from args
        optimizer = optim.Adam(self.nnet.parameters(), lr=args.lr)

        for examples in games:
            for epoch in range(args.epochs):
                print('EPOCH ::: ' + str(epoch + 1))
                self.nnet.train()
                v_losses = []   # To store the losses per epoch
                batch_count = int(len(examples) / args.batch_size)  # len(examples)=200, batch-size=64, batch_count=3
                t = tqdm(range(batch_count), desc='Training Value Network')
                for _ in t:
                    sample_ids = np.random.randint(len(examples), size=args.batch_size)  # Read the ground truth information from MCTS simulation using the loaded examples
                    boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))  # Length of boards, pis, vis = 64
                    boards = torch.FloatTensor(np.array(boards).astype(np.float64))
                    target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                    # Predict
                    # To run on GPU if available
                    boards, target_vs = boards.contiguous().to(args.device), target_vs.contiguous().to(args.device)

                    #################################################
                    ## TODO for students:
                    ## 1. Compute the value predicted by OthelloNNet() ##
                    ## 2. First implement the loss_v() function below and then use it to update the value loss. ##
                    # Fill out function and remove
                    raise NotImplementedError("Compute the output")
                    #################################################
                    # Compute output
                    _, out_v = ...
                    l_v = ...  # Total loss

                    # Record loss
                    v_losses.append(l_v.item())
                    t.set_postfix(Loss_v=l_v.item())

                    # Compute gradient and do SGD step; clear the previous
                    # batch's gradients first so they do not accumulate
                    optimizer.zero_grad()
                    l_v.backward()
                    optimizer.step()

    def predict(self, board):
        """
        Function to perform prediction

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]

        Returns:
          v: np.ndarray
            Value output of the network for this board
        """
        # Preparing input
        board = torch.FloatTensor(board.astype(np.float64))
        board = board.contiguous().to(args.device)
        board = board.view(1, self.board_x, self.board_y)
        self.nnet.eval()
        # Inference only: no gradient bookkeeping is needed
        with torch.no_grad():
            _, v = self.nnet(board)
        return v.data.cpu().numpy()

    def loss_v(self, targets, outputs):
        """
        Calculates Mean squared error

        Args:
          targets: np.ndarray
            Ground Truth variables corresponding to input
          outputs: np.ndarray
            Predictions of Network

        Returns:
          MSE Loss calculated as: square of the difference between your model's predictions
          and the ground truth and average across the whole dataset
        """
        #################################################
        ## TODO for students: Please compute Mean squared error and return as output. ##
        # Fill out function and remove
        raise NotImplementedError("Calculate the loss")
        #################################################
        # Mean squared error (MSE)
        return ...

    def save_checkpoint(self, folder='checkpoint', filename='checkpoint.pth.tar'):
        """
        Code Checkpointing

        Args:
          folder: string
            Folder in which the checkpoint is written (created if missing)
          filename: string
            File name of the checkpoint

        Returns:
          Nothing
        """
        filepath = os.path.join(folder, filename)
        if not os.path.exists(folder):
            print("Checkpoint Directory does not exist! Making directory {}".format(folder))
            os.makedirs(folder)  # also creates missing parent folders
        else:
            print("Checkpoint Directory exists! ")
        torch.save({'state_dict': self.nnet.state_dict(),}, filepath)
        print("Model saved! ")

    def load_checkpoint(self, folder='checkpoint', filename='checkpoint.pth.tar'):
        """
        Load network weights from a checkpoint written by save_checkpoint

        Args:
          folder: string
            Folder containing the checkpoint
          filename: string
            File name of the checkpoint

        Returns:
          Nothing

        Raises:
          FileNotFoundError: if no file exists at the checkpoint path
        """
        # https://github.com/pytorch/examples/blob/master/imagenet/main.py#L98
        filepath = os.path.join(folder, filename)
        if not os.path.exists(filepath):
            raise FileNotFoundError("No model in path {}".format(filepath))
        # Map the stored tensors onto the configured device, then restore the
        # weights stored under the 'state_dict' key used by save_checkpoint
        checkpoint = torch.load(filepath, map_location=args.device)
        self.nnet.load_state_dict(checkpoint['state_dict'])

atform.add_event('Coding Exercise 1.3: Implement the ValueNetwork')


Click for solution

## Section 1.4. Train the value network and observe the MSE loss progress¶

Important: Run this cell ONLY if you do not have access to the pretrained models in the nma_rl_games repository. The cell below will run the training algorithm and will take a while to complete…

We provide a fully trained value network in the nma_rl_games repository that will automatically load below.

# Only enter this branch when the pretrained-models directory is empty
if not os.listdir('nma_rl_games/alpha-zero/pretrained_models/models/'):
set_seed(seed=SEED)
game = OthelloGame(6)
vnet = ValueNetwork(game)
# NOTE(review): no training or checkpoint-saving call follows, although the surrounding
# text says this cell trains the network - lines appear to be missing; TODO restore


# Section 2: Use a trained value network to play games¶

Time estimate: ~25mins

Now that we have our value network all set up and trained, we’re ready to test it by using it to play games.

Goal: Learn how to use a value function in order to make a player that works better than a random player.

Exercise:

• Sample random valid moves and use the value function to rank them

• Choose the best move as the action and play it.

• Show that doing so beats the random player.

Hint: You might need to change the sign of the value based on the player.

## Coding Exercise 2: Value-based player¶

Let’s first initialize a new game and load in a pre-trained Value function.

# File name and folder of the pretrained value network
model_save_name = 'ValueNetwork.pth.tar'
path = "nma_rl_games/alpha-zero/pretrained_models/models/"
set_seed(seed=SEED)
game = OthelloGame(6)
vnet = ValueNetwork(game)
# NOTE(review): model_save_name and path are not used in the visible code; the call that
# loads the pretrained weights into vnet appears to be missing - TODO restore

Random seed 2021 has been set.


Next, we can create a player that makes use of the value function to decide what best action to take next.

How do we choose the best move using our value network? We will simply compute the expected value (predicted value) of all possible moves and then select the best one based on which next state has the highest value.

class ValueBasedPlayer():
    """
    Simulate Value Based Player
    """

    def __init__(self, game, vnet):
        """
        Initialise value based player parameters

        Args:
          game: OthelloGame instance
            Instance of the OthelloGame class above;
          vnet: Value Network instance
            Instance of the Value Network class above;

        Returns:
          Nothing
        """
        self.game = game
        self.vnet = vnet

    def play(self, board):
        """
        Simulate game play

        Up to `max_num_actions` valid moves are sampled in random order,
        the value network scores the state each one leads to, and the best
        one is chosen.

        Args:
          board: np.ndarray
            Board of size n x n [6x6 in this case]

        Returns:
          a: int
            Action of the first candidate after sorting; with candidates
            stored as (sign-reversed value, action) tuples, as the TODO
            below asks, this is the action with the highest predicted value
        """
        valids = self.game.getValidMoves(board, 1)
        candidates = []
        max_num_actions = 4
        # np.where returns a tuple with one index array per dimension;
        # take the first (only) one to get the indices of the valid actions
        va = np.where(valids)[0]
        va_list = va.tolist()
        random.shuffle(va_list)
        #################################################
        ## TODO for students: In the first part, please return the next board state using getNextState(), then predict
        ## the value of next state using value network, and finally add the value and action as a tuple to the candidate list.
        ## Note that you need to reverse the sign of the value. In zero-sum games the players flip every turn. In detail, we train
        ## a value function to think about the game from one player's (either black or white) perspective. In order to use the same
        ## value function to estimate how good the position is for the other player, we need to take the negative of the output of
        ## the function. E.g., if the value function is trained for white's perspective and says that white is likely to win the game
        ## from the current state with an output of 0.75, this similarly means that it would suggest that black is very unlikely (-0.75)
        ## to win the game from the current state.##
        # Fill out function and remove
        raise NotImplementedError("Implement the value-based player")
        #################################################
        for a in va_list:
            # Return next board state using getNextState() function
            nextBoard, _ = ...
            # Predict the value of next state using value network
            value = ...
            # Add the value and the action as a tuple to the candidate lists, note that you might need to change the sign of the value based on the player
            candidates += ...

            if len(candidates) == max_num_actions:
                break

        # Sort by the values
        candidates.sort()

        # Return action associated with highest value
        return candidates[0][1]

# Playing games between a value-based player and a random player
# Reset the random state before the games are set up
set_seed(seed=SEED)
num_games = 20
# The arena receives each player's bound `play` method
player1 = ValueBasedPlayer(game, vnet).play
player2 = RandomPlayer(game).play
arena = Arena.Arena(player1, player2, game, display=OthelloGame.display)
## Uncomment the code below to check your code!
# NOTE(review): the result is presumably a 3-tuple of game counts like the (14, 6, 0)
# shown in the notebook output - confirm against Arena.playGames
# result = arena.playGames(num_games, verbose=False)
# print(f"\n\n{result}")

Random seed 2021 has been set.


Click for solution

(14, 6, 0)


# Summary¶

In this tutorial, you have learned about value-based players and compared them to a random player.