# Sutton and Barto
# Example 3.5: Gridworld Figure 3.2 (left) shows a rectangular
# gridworld representation of a simple finite MDP. The cells
# of the grid correspond to the states of the environment.
# At each cell, four actions are possible: north, south,
# east, and west, which deterministically cause the agent to
# move one cell in the respective direction on the grid.
# Actions that would take the agent off the grid leave its
# location unchanged, but also result in a reward of −1.
# Other actions result in a reward of 0, except those that
# move the agent out of the special states A and B.
# From state A, all four actions yield a reward of +10
# and take the agent to A′. From state B, all actions yield a
# reward of +5 and take the agent to B′.
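# Under the equiprobable random policy, pi(a|s) = 0.25 for every action, the
# state-value function satisfies the Bellman expectation equation
#
#     v_pi(s) = sum_a pi(a|s) * [ r(s, a) + gamma * v_pi(s') ]
#
# where s' is the deterministic successor of s under action a. Stacking all
# 25 states gives the linear system v = r + gamma * P * v, which the Solver
# class below solves exactly instead of iterating.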
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
class Gridworld:
    NORTH = 0
    SOUTH = 1
    EAST = 2
    WEST = 3

    def __init__(self):
        self.rows = 5
        self.cols = 5
        self.state = (0, 0)  # Starting state
        self.A = (0, 1)
        self.A_prime = (4, 1)
        self.B = (0, 3)
        self.B_prime = (2, 3)

    def step(self, action):
        """Execute the action and return the new state and the reward."""
        i, j = self.state
        if self.state == self.A:
            self.state = self.A_prime
            return self.state, 10
        elif self.state == self.B:
            self.state = self.B_prime
            return self.state, 5
        if action == self.NORTH:
            new_r, new_c = max(i - 1, 0), j
        elif action == self.SOUTH:
            new_r, new_c = min(i + 1, self.rows - 1), j
        elif action == self.EAST:
            new_r, new_c = i, min(j + 1, self.cols - 1)
        elif action == self.WEST:
            new_r, new_c = i, max(j - 1, 0)
        else:
            raise ValueError("Invalid action")
        if (new_r, new_c) == (i, j):
            reward = -1
        else:
            reward = 0
        self.state = (new_r, new_c)
        return self.state, reward

    def reset(self):
        """Reset the environment to the starting state."""
        self.state = (0, 0)
        return self.state

    def reward_grid(self):
        """Build reward grid."""
        grid = np.zeros((self.rows, self.cols))
        grid[self.A] = 10
        grid[self.B] = 5
        return grid
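
# A quick illustration of the step() dynamics described above (kept as a
# comment so nothing runs at import time):
#
#   env = Gridworld()
#   env.state = (0, 1)           # state A
#   env.step(Gridworld.NORTH)    # -> ((4, 1), 10): any action from A jumps to A' with reward +10
#   env.state = (0, 0)
#   env.step(Gridworld.WEST)     # -> ((0, 0), -1): bumping a wall leaves the state unchanged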

class Solver:
    def __init__(self, env, action_probability=0.25, gamma=0.9):
        self.env = env
        self.rows = env.rows
        self.cols = env.cols
        self.action_probability = action_probability
        self.gamma = gamma
        self.actions = [env.NORTH, env.SOUTH, env.EAST, env.WEST]
        if gamma >= 1.0:
            # With no terminal state, (I - gamma * P) is singular for gamma = 1,
            # so the closed-form solution below requires gamma < 1.
            raise ValueError("gamma must be < 1.0 for the value function to be well defined.")

    def to_index(self, i, j):
        """Convert a 2D grid position to a 1D state index."""
        return i * self.cols + j

    def bellman(self):
        """Compute the value function by solving the Bellman equation as a linear system."""
        n_states = self.rows * self.cols
        P = np.zeros((n_states, n_states))
        reward_vector = np.zeros(n_states)
        for i in range(self.rows):
            for j in range(self.cols):
                s = self.to_index(i, j)
                for action in self.actions:
                    # Evaluate the transition assuming we start from (i, j)
                    self.env.state = (i, j)
                    s_prime, rew = self.env.step(action)
                    s_prime_index = self.to_index(s_prime[0], s_prime[1])
                    # If the action leaves the agent in the same state, it hit a wall,
                    # and the probability of staying in that state accumulates.
                    # For example, from state (0, 0):
                    #   going WEST  stays in (0, 0) with probability 0.25,
                    #   going NORTH stays in (0, 0) with probability 0.25,
                    # so the total probability of ending up in (0, 0) is 0.5.
                    P[s, s_prime_index] += self.action_probability
                    # Accumulate the expected reward, also weighted by the action probability.
                    # For example, from state (0, 0):
                    #   WEST  gives reward -1 with probability 0.25
                    #   NORTH gives reward -1 with probability 0.25
                    #   EAST  gives reward  0 with probability 0.25
                    #   SOUTH gives reward  0 with probability 0.25
                    # so the expected reward is (-1)*0.25 + (-1)*0.25 + 0*0.25 + 0*0.25 = -0.5,
                    # i.e. reward_vector[0] = -0.5.
                    reward_vector[s] += rew * self.action_probability
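        # In matrix form the Bellman equation for this policy reads
        #     v = reward_vector + gamma * P @ v
        # so the exact value function is the solution of the linear system
        #     (I - gamma * P) v = reward_vector
        # which np.linalg.solve computes below.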
        v = np.linalg.solve(np.eye(n_states) - self.gamma * P, reward_vector)
        return v.reshape((self.rows, self.cols))

    def show(self, v=None):
        """Print and plot the value function in grid format."""
        if v is None:
            v = self.bellman()
        print(v)
        plt.figure(figsize=(6, 5))
        sns.heatmap(
            v,            # matrix (rows x cols)
            annot=True,   # show the values
            fmt=".2f",    # 2 decimal places
            cmap="Blues",
            linewidths=0.5,
            cbar=True
        )
        plt.title("Value of v_pi for the random policy")
        plt.tight_layout()
        plt.show()


if __name__ == "__main__":
    env = Gridworld()
    solver = Solver(env, action_probability=0.25, gamma=0.5)
    v = solver.bellman()
    solver.show(v)
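
    # A minimal sanity check (sketch): iterative policy evaluation under the same
    # random policy should converge to the closed-form values computed above.
    # The sweep count and tolerance below are arbitrary choices.
    v_iter = np.zeros((env.rows, env.cols))
    for _ in range(1000):
        v_new = np.zeros_like(v_iter)
        for i in range(env.rows):
            for j in range(env.cols):
                for action in solver.actions:
                    env.state = (i, j)
                    (ni, nj), rew = env.step(action)
                    v_new[i, j] += solver.action_probability * (rew + solver.gamma * v_iter[ni, nj])
        diff = np.max(np.abs(v_new - v_iter))
        v_iter = v_new
        if diff < 1e-8:
            break
    print("Max difference vs. closed-form solution:", np.max(np.abs(v_iter - v)))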