# Sutton and Barto
# Example 3.5: Gridworld. Figure 3.2 (left) shows a rectangular
# gridworld representation of a simple finite MDP. The cells
# of the grid correspond to the states of the environment.
# At each cell, four actions are possible: north, south,
# east, and west, which deterministically cause the agent to
# move one cell in the respective direction on the grid.
# Actions that would take the agent off the grid leave its
# location unchanged, but also result in a reward of −1.
# Other actions result in a reward of 0, except those that
# move the agent out of the special states A and B.
# From state A, all four actions yield a reward of +10
# and take the agent to A′. From state B, all actions yield a
# reward of +5 and take the agent to B′.
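#
# Added note (not part of the book excerpt above): under the equiprobable
# random policy pi(a|s) = 0.25, the state-value function satisfies the
# Bellman expectation equation
#     v_pi(s) = sum_a pi(a|s) * [ r(s, a) + gama * v_pi(s') ],
# where s' is the deterministic successor of s under action a. Stacking the
# states into vectors gives v = r + gama * P * v, i.e.
#     v = (I - gama * P)^(-1) * r,
# which is exactly the linear system solved in Solver.bellman() below.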
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


class Gridworld:
    NORTH = 0
    SOUTH = 1
    EAST = 2
    WEST = 3

    def __init__(self):
        self.rows = 5
        self.cols = 5
        self.state = (0, 0)  # Starting state
        self.A = (0, 1)
        self.A_prime = (4, 1)
        self.B = (0, 3)
        self.B_prime = (2, 3)

    def step(self, action):
        """Execute an action and return the new state and reward."""
        i, j = self.state
        if self.state == self.A:
            self.state = self.A_prime
            return self.state, 10
        elif self.state == self.B:
            self.state = self.B_prime
            return self.state, 5
        if action == self.NORTH:
            new_r, new_c = max(i - 1, 0), j
        elif action == self.SOUTH:
            new_r, new_c = min(i + 1, self.rows - 1), j
        elif action == self.EAST:
            new_r, new_c = i, min(j + 1, self.cols - 1)
        elif action == self.WEST:
            new_r, new_c = i, max(j - 1, 0)
        else:
            raise ValueError("Invalid action")
        # Trying to move off the grid leaves the state unchanged and costs -1.
        if (new_r, new_c) == (i, j):
            reward = -1
        else:
            reward = 0
        self.state = (new_r, new_c)
        return self.state, reward

    def reset(self):
        """Reset the environment to the starting state."""
        self.state = (0, 0)
        return self.state

    def reward_grid(self):
        """Build a grid with the special-state rewards (A: +10, B: +5)."""
        grid = np.zeros((self.rows, self.cols))
        grid[self.A] = 10
        grid[self.B] = 5
        return grid
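
# Added illustration (not part of the original gist): a quick sanity check of
# the dynamics described in the header comment. The function name and the
# chosen actions are arbitrary; call it manually to see the transitions.
def demo_dynamics():
    """Show the special-state teleports and the off-grid penalty."""
    env = Gridworld()
    env.state = env.A
    print(env.step(Gridworld.SOUTH))  # ((4, 1), 10): any action from A pays +10 and lands in A'
    env.state = env.B
    print(env.step(Gridworld.WEST))   # ((2, 3), 5): any action from B pays +5 and lands in B'
    env.reset()
    print(env.step(Gridworld.NORTH))  # ((0, 0), -1): moving off the grid keeps the state, reward -1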
class Solver:
    def __init__(self, env, action_probability=0.25, gama=0.9):
        self.env = env
        self.rows = env.rows
        self.cols = env.cols
        self.action_probability = action_probability
        self.gama = gama
        self.actions = [env.NORTH, env.SOUTH, env.EAST, env.WEST]
        if gama >= 1.0:
            # With a stochastic P, I - gama*P is singular for gama = 1, so the
            # direct solve below would fail and v_pi is not well defined.
            raise ValueError("gama must be < 1.0 for the value function to be well defined.")

    def to_index(self, i, j):
        """Convert a 2D grid position to a 1D state index."""
        return i * self.cols + j

    def bellman(self):
        """Compute v_pi exactly by solving the linear system (I - gama*P) v = r."""
        n_states = self.rows * self.cols
        P = np.zeros((n_states, n_states))
        reward_vector = np.zeros(n_states)
        for i in range(self.rows):
            for j in range(self.cols):
                s = self.to_index(i, j)
                for action in self.actions:
                    # Evaluate the transition assuming we start from (i, j).
                    self.env.state = (i, j)
                    s_prime, rew = self.env.step(action)
                    s_prime_index = self.to_index(s_prime[0], s_prime[1])
                    # If several actions lead to the same successor (e.g. hitting a
                    # wall keeps the agent in place), their probabilities add up.
                    # Example: from (0, 0), WEST and NORTH both leave the agent in
                    # (0, 0) with probability 0.25 each, so P[(0,0), (0,0)] = 0.5
                    # once all four actions have been processed.
                    P[s, s_prime_index] += self.action_probability
                    # Accumulate the expected immediate reward, weighted by the
                    # action probability. Example: from (0, 0), WEST and NORTH give
                    # -1, EAST and SOUTH give 0, so
                    # reward_vector[0] = (-1)*0.25 + (-1)*0.25 + 0*0.25 + 0*0.25 = -0.5.
                    reward_vector[s] += rew * self.action_probability
        v = np.linalg.solve(np.eye(n_states) - self.gama * P, reward_vector)
        return v.reshape((self.rows, self.cols))
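
    # --- Added cross-check (not in the original gist) ---
    # The same v_pi can be approximated by sweeping the Bellman expectation
    # backup v(s) <- sum_a pi(a|s) * [r + gama * v(s')] until it stops changing.
    # The tolerance and iteration cap below are arbitrary choices.
    def iterative_evaluation(self, tol=1e-8, max_iter=10000):
        """Approximate v_pi by repeated Bellman expectation backups."""
        v = np.zeros((self.rows, self.cols))
        for _ in range(max_iter):
            v_new = np.zeros_like(v)
            for i in range(self.rows):
                for j in range(self.cols):
                    for action in self.actions:
                        self.env.state = (i, j)
                        (ni, nj), rew = self.env.step(action)
                        v_new[i, j] += self.action_probability * (rew + self.gama * v[ni, nj])
            if np.max(np.abs(v_new - v)) < tol:
                return v_new
            v = v_new
        return v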
    def show(self, v=None):
        """Print the value function and plot it as a heatmap."""
        if v is None:
            v = self.bellman()
        print(v)
        plt.figure(figsize=(6, 5))
        sns.heatmap(
            v,                # (rows x cols) matrix
            annot=True,       # show the values
            fmt=".2f",        # 2 decimal places
            cmap="Blues",
            linewidths=0.5,
            cbar=True
        )
        plt.title("vπ under the equiprobable random policy")
        plt.tight_layout()
        plt.show()
if __name__ == "__main__":
    env = Gridworld()
    # gama = 0.5 is used here; Sutton & Barto use gamma = 0.9 for the values
    # shown in Figure 3.2.
    solver = Solver(env, action_probability=0.25, gama=0.5)
    v = solver.bellman()
    solver.show(v)