In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# assumed setup: a deterministic 4x4 FrozenLake, which matches the Discrete(16)/
# Discrete(4) spaces and the single-outcome env.P entries shown below
env = gym.make('FrozenLake-v0', is_slippery=False)
In [3]:
env.reset()
# env.render()
# env.step(2)
Out[3]:
0
In [4]:
# env.step(2)
In [5]:
# env.step(2)
In [6]:
print(env.observation_space)
env.action_space
Discrete(16)
Out[6]:
Discrete(4)
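The observation space has 16 discrete states, one per cell of the 4x4 lake, and the action space has 4 discrete actions: 0 = left, 1 = down, 2 = right, 3 = up.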
In [7]:
env.P[2][2]
Out[7]:
[(1.0, 3, 0.0, False)]
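Each element of env.P[state][action] is a (transition probability, next state, reward, done) tuple, so this entry says that taking action 2 (right) in state 2 deterministically reaches state 3 with reward 0. A minimal sketch of how the value-iteration sweep below unpacks these tuples:

for prob, next_state, reward, done in env.P[2][2]:
    print(prob, next_state, reward, done)   # 1.0 3 0.0 False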
In [8]:
def value_iteration(env, gamma=1.0):
    value_table = np.zeros(env.observation_space.n)   # one value per state, initialised to zero
    no_of_iterations = 100000   # iteration cap and threshold are typical choices
    threshold = 1e-20
    for i in range(no_of_iterations):
        updated_value_table = np.copy(value_table)    # previous sweep, kept for the convergence check
        for state in range(env.observation_space.n):
            # Q value of each action, from a one-step lookahead over the transitions
            Q_value = []
            for action in range(env.action_space.n):
                next_states_rewards = []
                for trans_prob, next_state, reward, _ in env.P[state][action]:
                    next_states_rewards.append(trans_prob * (reward + gamma * updated_value_table[next_state]))
                Q_value.append(np.sum(next_states_rewards))
            value_table[state] = max(Q_value)
        # we check whether we have reached convergence, i.e. whether the difference between
        # our value table and the updated value table is very small. How do we know it is
        # very small? We set a threshold: if the difference is less than the threshold, we
        # break the loop and return the value function as the optimal value function
        if np.sum(np.fabs(updated_value_table - value_table)) <= threshold:
            print('Value-iteration converged at iteration %d.' % (i + 1))
            break
    return value_table
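Each sweep applies the Bellman optimality backup V(s) <- max_a sum_{s'} P(s'|s,a) * [R(s,a,s') + gamma * V(s')], so value_table converges to the optimal value function V*.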
In [9]:
def extract_policy(value_table, gamma = 0.9):
    policy = np.zeros(env.observation_space.n)
    for state in range(env.observation_space.n):
        # Q value of each action, from a one-step lookahead on the converged value table
        Q_table = [sum(p * (r + gamma * value_table[s_]) for p, s_, r, _ in env.P[state][a])
                   for a in range(env.action_space.n)]
        # select the action which has maximum Q value as an optimal action of the state
        policy[state] = np.argmax(Q_table)
    return policy
In [10]:
optimal_value_function = value_iteration(env=env,gamma=0.9)
optimal_value_function
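The returned table holds the optimal value of each of the 16 states; extract_policy then recovers the greedy policy from it with the same one-step lookahead.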
In [11]:
optimal_policy = extract_policy(optimal_value_function, gamma=0.9)
optimal_policy
Out[11]:
array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 2., 0.])
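Each entry is the greedy action for the corresponding state (using the 0 = left, 1 = down, 2 = right, 3 = up coding); the next cell reshapes the flat 16-entry policy into the 4x4 layout of the lake.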
In [12]:
opt_pol = optimal_policy.reshape(4,4)
print('THE OPTIMAL POLICY IS \n ',opt_pol)
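As a quick sanity check, the policy can be rolled out from the start state; a minimal sketch, assuming the classic gym step signature (observation, reward, done, info) rather than the newer five-value API:

state = env.reset()
done = False
while not done:
    # follow the greedy policy until the episode ends
    state, reward, done, _ = env.step(int(optimal_policy[state]))
print('episode return:', reward)   # 1.0 means the agent reached the goal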