
ASSIGNMENT - 3

In [1]:

import gym
import numpy as np
import matplotlib.pyplot as plt

In [2]:

env = gym.make("FrozenLake-v1", render_mode="human", is_slippery=False)

In [3]:

env.reset()
# env.render()
# env.step(2)
Out[3]:

(0, {'prob': 1})

In [4]:
# env.step(2)

In [5]:

# env.step(2)

In [6]:
print(env.observation_space)
env.action_space

Discrete(16)
Out[6]:

Discrete(4)

In [7]:
env.P[2][2]
Out[7]:
[(1.0, 3, 0.0, False)]
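
Each entry of env.P[state][action] is a (transition probability, next state, reward, done) tuple; because the environment was created with is_slippery=False, every action leads to exactly one successor state with probability 1. A minimal sketch (not part of the original assignment) for listing all transitions out of one state -- the state index 0 is only an example, and on some gym versions the table has to be reached through env.unwrapped.P:

# inspect the transition model for a single state (state 0 is just an example)
state = 0
for action in range(env.action_space.n):
    for trans_prob, next_state, reward, done in env.P[state][action]:
        print(f"action={action}: prob={trans_prob}, "
              f"next_state={next_state}, reward={reward}, done={done}")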

In [8]:

def value_iteration(env, gamma=0.9):

    # initialize the value table with zeros
    value_table = np.zeros(env.observation_space.n)

    # set the number of iterations and the convergence threshold
    no_of_iterations = 100000
    threshold = 1e-20

    for i in range(no_of_iterations):

        # on each iteration, copy the value table to updated_value_table
        updated_value_table = np.copy(value_table)

        # compute the Q value for each action in the state
        # and update the value of the state with the maximum Q value
        for state in range(env.observation_space.n):
            Q_value = []
            for action in range(env.action_space.n):
                next_states_rewards = []
                for next_sr in env.P[state][action]:
                    trans_prob, next_state, reward_prob, _ = next_sr
                    next_states_rewards.append(
                        trans_prob * (reward_prob + gamma * updated_value_table[next_state]))

                Q_value.append(np.sum(next_states_rewards))

            value_table[state] = max(Q_value)

        # check whether we have reached convergence, i.e. whether the difference
        # between the value table and the updated value table is very small.
        # We set a threshold; if the difference is less than the threshold,
        # we break the loop and return the value function as the optimal value function.
        if np.sum(np.fabs(updated_value_table - value_table)) <= threshold:
            print('Value-iteration converged at iteration# %d.' % (i + 1))
            break

    return value_table
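
Each sweep of the inner loops is a Bellman optimality backup: V(s) <- max over actions a of the sum over successors s' of P(s'|s,a) * (R(s,a,s') + gamma * V(s')). As a quick illustration (not part of the original notebook), the backup for a single state can be written on its own; the all-zeros starting value table and the choice of state 14 below are only assumptions for the example:

# one Bellman optimality backup for a single state,
# starting from an all-zeros value table (assumption for illustration)
gamma = 0.9
value_table = np.zeros(env.observation_space.n)
state = 14  # example: the cell directly left of the goal on the 4x4 map
q_values = [sum(p * (r + gamma * value_table[s_next])
                for p, s_next, r, _ in env.P[state][action])
            for action in range(env.action_space.n)]
print("Q(s, .) =", q_values, "-> backed-up V(s) =", max(q_values))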

In [9]:
def extract_policy(value_table, gamma=0.9):

    # initialize the policy with zeros
    policy = np.zeros(env.observation_space.n)

    for state in range(env.observation_space.n):

        # initialize the Q table for a state
        Q_table = np.zeros(env.action_space.n)

        # compute the Q value for all actions in the state
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]:
                trans_prob, next_state, reward_prob, _ = next_sr
                Q_table[action] += trans_prob * (reward_prob + gamma * value_table[next_state])

        # select the action with the maximum Q value as the optimal action for the state
        policy[state] = np.argmax(Q_table)

    return policy
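
A simple sanity check (not part of the original assignment) is to roll out the extracted policy in the environment and confirm it collects the goal reward. The helper below assumes the newer gym step API that returns (obs, reward, terminated, truncated, info), which matches the reset output shown above:

def run_policy(env, policy, max_steps=100):
    # follow the greedy policy until the episode ends and return the total reward
    state, _ = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        state, reward, terminated, truncated, _ = env.step(int(policy[state]))
        total_reward += reward
        if terminated or truncated:
            break
    return total_reward

Once optimal_policy has been computed below, run_policy(env, optimal_policy) should reach the goal on this deterministic 4x4 map and return a reward of 1.0.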

In [10]:
optimal_value_function = value_iteration(env=env, gamma=0.9)
optimal_value_function

Value-iteration converged at iteration# 7.


Out[10]:
array([0.59049, 0.6561 , 0.729 , 0.6561 , 0.6561 , 0. , 0.81 ,
0. , 0.729 , 0.81 , 0.9 , 0. , 0. , 0.9 ,
1. , 0. ])

In [11]:
optimal_policy = extract_policy(optimal_value_function, gamma=0.9)
optimal_policy
Out[11]:

array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 2., 0.])

In [12]:
opt_pol = optimal_policy.reshape(4, 4)
print('THE OPTIMAL POLICY IS \n ', opt_pol)

THE OPTIMAL POLICY IS


[[1. 2. 1. 0.]
[1. 0. 1. 0.]
[2. 1. 1. 0.]
[0. 2. 2. 0.]]
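
For readability, the integer actions can be mapped to FrozenLake's directions (0 = Left, 1 = Down, 2 = Right, 3 = Up); a small optional sketch:

# render the policy grid with arrows instead of action indices
arrows = np.array(['<', 'v', '>', '^'])
print(arrows[optimal_policy.astype(int)].reshape(4, 4))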
