
##################
## Example 11.0: Baird counterexample
## using the Emphatic-TD algorithm on p. 304
##################

# Import packages and functions.

import numpy as np
import matplotlib.pyplot as plt

which = lambda status: np.arange(len(status))[status]
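# which() mimics R's which(): it returns the indices at which a boolean mask
# is True, e.g. which(np.array([False, True, True])) gives array([1, 2]).
# (It is defined here for convenience but not used below.)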

# parameter settings

n1 = 7 # number of states
n2 = n1 # number of working states

alpha = 0.03
#prob_eps = 0.1

discount = 0.99
#eps = 1.0e-7 # should be small enough

# Two possible actions: dashed = 0; solid = 1

action_word = np.array(["dashed", "solid"])

action = np.array([0, 1])


action_num = 2

# building environment

def step(state, move):
    # Every transition in Baird's counterexample has reward 0.
    reward = 0
    if move == 0:  # dashed action: jump uniformly to one of the six upper states
        next_state = int(np.random.choice([0, 1, 2, 3, 4, 5]))
    else:  # solid action: always go to the seventh state
        next_state = 6
    return {"next_state": next_state, "reward": reward}

def b_move(state):
    # Behavior policy b: dashed with probability 6/7, solid with probability 1/7.
    rand_num = np.random.uniform(low = 0, high = 1)
    if rand_num <= 6/7:
        move = 0  # dashed action
    else:
        move = 1  # solid action
    return move

b_action_prob = np.array([6/7, 1/7])   # behavior policy b

pi_action_prob = np.array([0.0, 1.0])  # target policy pi: always solid

# Importance-sampling ratios rho(a) = pi(a) / b(a) = [0, 7].
rho = pi_action_prob / b_action_prob

print(rho)
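# Optional sanity check (added; runs before the seed is set below, so it does
# not affect the seeded experiment): the empirical frequency of the solid
# action under b_move should be close to 1/7.
print(np.mean([b_move(0) for _ in range(100000)]))  # approx 0.143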
def v_value(state, w):
    # Linear value estimate with Baird's feature mapping:
    # v(s) = 2 * w[s] + w[7] for the six upper states, v(6) = w[6] + 2 * w[7].
    if state == 6:
        value = w[6] + 2 * w[7]
    else:
        value = 2 * w[state] + w[7]
    return value
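# Equivalent feature-matrix view (a sketch for reference only; the algorithm
# below does not use it): each state s has a feature vector x(s) such that
# v(s, w) = x(s) . w.  Row s of X below is x(s).
X = np.zeros((7, 8))
for s in range(6):
    X[s, s] = 2.0
    X[s, 7] = 1.0
X[6, 6] = 1.0
X[6, 7] = 2.0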

def grad_v(state, w):
    # Gradient of the linear value estimate, i.e. the feature vector x(s).
    grad = np.zeros(len(w))
    if state == 6:
        grad[6] = 1.0
        grad[7] = 2.0
    else:
        grad[state] = 2.0
        grad[7] = 1.0
    return grad
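# Optional consistency check (added): grad_v should reproduce the rows of X.
for s in range(7):
    assert np.allclose(grad_v(s, np.zeros(8)), X[s])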

# main loop

seed = 543
np.random.seed(seed) # Set random seed for reproducibility.

interest = [1, 1, 1, 1, 1, 1, 1]  # interest I(s) = 1 for every state

step_num = 1000  # number of steps

w = np.array([1, 1, 1, 1, 1, 1, 10, 1], dtype = float) # initial weights

w_matrix = np.zeros((step_num + 1, 8))

w_matrix[0, :] = w # initial weights

state = int(np.random.choice(range(n2))) # random initial state

# One-step Emphatic-TD (lambda = 0): the weight update is scaled by the
# emphasis M, which follows the followon-trace recursion
#   M_t = discount * rho_{t-1} * M_{t-1} + I(S_t).
M = 0.0
rho_prev = 1.0

for i in range(1, step_num + 1):
    # Choose an action from the behavior policy and take one step.
    move = b_move(state)
    step_obj = step(state, move)
    next_state = step_obj["next_state"]
    reward = step_obj["reward"]
    # Update the emphasis, the TD error, and the weights.
    M = discount * rho_prev * M + interest[state]
    delta = reward + discount * v_value(next_state, w) - v_value(state, w)
    w = w + alpha * M * rho[move] * delta * grad_v(state, w)
    # rho[0] = 0, so dashed transitions leave w unchanged and the emphasis
    # resets to I(S) on the following step.
    rho_prev = rho[move]
    w_matrix[i, :] = np.copy(w)
    state = next_state
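# Caveat (added): Figure 11.6 in the book shows the *expected* behavior of
# one-step Emphatic-TD on this problem; a single sampled run like the one
# above has very high variance, because consecutive solid transitions
# multiply the emphasis M by rho = 7, so an individual trajectory can look
# far noisier than the figure.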

print(w_matrix)  # full weight trajectory

print(w_matrix[-1, :])  # final weights

# Calculate state values.


state_value = np.zeros(7)

for i in range(7):
    state_value[i] = v_value(i, w)

print(state_value)
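# For reference: every reward in this MDP is 0, so the true value of every
# state is 0 under any policy; the printed values show how far the learned
# weights are from that solution.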

# Figure 11.6

colors = ["black", "red", "green", "blue", "yellow", "cyan", "magenta", "purple"]

linestyles = ["solid", "dashed", "dotted", "dashdot"]

plt.figure("Figure 11.6")
for j in range(8):
plt.plot(range(step_num + 1), w_matrix[:, j],
color = colors[j], linestyle = linestyles[j%4])

plt.xlabel("step")
plt.ylabel("weight value")
plt.legend(['w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7', 'w8'], loc = "best", frameon
= False)
plt.show()

##################
