# Sutton and Barto
# Example 3.5: Gridworld Figure 3.2 (left) shows a rectangular
# gridworld representation of a simple finite MDP. The cells
# of the grid correspond to the states of the environment.
# At each cell, four actions are possible: north, south,
# east, and west, which deterministically cause the agent to
# move one cell in the respective direction on the grid.
# Actions that would take the agent off the grid leave its
# location unchanged, but also result in a reward of −1.
# Other actions result in a reward of 0, except those that
# move the agent out of the special states A and B.
# From state A, all four actions yield a reward of +10
# and take the agent to A′. From state B, all actions yield a
# reward of +5 and take the agent to B′.
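# Under the equiprobable random policy, pi(a|s) = 0.25 for every action, the
# state-value function satisfies the Bellman expectation equation
#
#     v_pi(s) = sum_a pi(a|s) * [ r(s, a) + gamma * v_pi(s') ]
#
# where s' is the deterministic successor of s under action a. Stacking all
# 25 states gives the linear system v = r + gamma * P * v, which the Solver
# class below solves exactly instead of iterating.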
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
class Gridworld:
    NORTH = 0
    SOUTH = 1
    EAST = 2
    WEST = 3

    def __init__(self):
        self.rows = 5
        self.cols = 5
        self.state = (0, 0)  # Starting state
        self.A = (0, 1)
        self.A_prime = (4, 1)
        self.B = (0, 3)
        self.B_prime = (2, 3)

    def step(self, action):
        """Execute the action and return the new state and the reward."""
        i, j = self.state
        if self.state == self.A:
            self.state = self.A_prime
            return self.state, 10
        elif self.state == self.B:
            self.state = self.B_prime
            return self.state, 5
        if action == self.NORTH:
            new_r, new_c = max(i - 1, 0), j
        elif action == self.SOUTH:
            new_r, new_c = min(i + 1, self.rows - 1), j
        elif action == self.EAST:
            new_r, new_c = i, min(j + 1, self.cols - 1)
        elif action == self.WEST:
            new_r, new_c = i, max(j - 1, 0)
        else:
            raise ValueError("Invalid action")
        if (new_r, new_c) == (i, j):
            reward = -1
        else:
            reward = 0
        self.state = (new_r, new_c)
        return self.state, reward

    def reset(self):
        """Reset the environment to the starting state."""
        self.state = (0, 0)
        return self.state

    def reward_grid(self):
        """Build reward grid."""
        grid = np.zeros((self.rows, self.cols))
        grid[self.A] = 10
        grid[self.B] = 5
        return grid
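
# A quick illustration of the step() dynamics described above (kept as a
# comment so nothing runs at import time):
#
#   env = Gridworld()
#   env.state = (0, 1)           # state A
#   env.step(Gridworld.NORTH)    # -> ((4, 1), 10): any action from A jumps to A' with reward +10
#   env.state = (0, 0)
#   env.step(Gridworld.WEST)     # -> ((0, 0), -1): bumping a wall leaves the state unchanged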

class Solver:
    def __init__(self, env, action_probability=0.25, gamma=0.9):
        self.env = env
        self.rows = env.rows
        self.cols = env.cols
        self.action_probability = action_probability
        self.gamma = gamma
        self.actions = [env.NORTH, env.SOUTH, env.EAST, env.WEST]
        if gamma >= 1.0:
            # With no terminal state, (I - gamma * P) is singular for gamma = 1,
            # so the closed-form solution below requires gamma < 1.
            raise ValueError("gamma must be < 1.0 for the value function to be well defined.")

    def to_index(self, i, j):
        """Convert a 2D grid position to a 1D state index."""
        return i * self.cols + j

    def bellman(self):
        """Compute the value function by solving the Bellman equation as a linear system."""
        n_states = self.rows * self.cols
        P = np.zeros((n_states, n_states))
        reward_vector = np.zeros(n_states)
        for i in range(self.rows):
            for j in range(self.cols):
                s = self.to_index(i, j)
                for action in self.actions:
                    # Evaluate the transition assuming we start from (i, j)
                    self.env.state = (i, j)
                    s_prime, rew = self.env.step(action)
                    s_prime_index = self.to_index(s_prime[0], s_prime[1])
                    # If the action leaves the agent in the same state, it hit a wall,
                    # and the probability of staying in that state accumulates.
                    # For example, from state (0, 0):
                    #   going WEST  stays in (0, 0) with probability 0.25,
                    #   going NORTH stays in (0, 0) with probability 0.25,
                    # so the total probability of ending up in (0, 0) is 0.5.
                    P[s, s_prime_index] += self.action_probability
                    # Accumulate the expected reward, also weighted by the action probability.
                    # For example, from state (0, 0):
                    #   WEST  gives reward -1 with probability 0.25
                    #   NORTH gives reward -1 with probability 0.25
                    #   EAST  gives reward  0 with probability 0.25
                    #   SOUTH gives reward  0 with probability 0.25
                    # so the expected reward is (-1)*0.25 + (-1)*0.25 + 0*0.25 + 0*0.25 = -0.5,
                    # i.e. reward_vector[0] = -0.5.
                    reward_vector[s] += rew * self.action_probability
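        # In matrix form the Bellman equation for this policy reads
        #     v = reward_vector + gamma * P @ v
        # so the exact value function is the solution of the linear system
        #     (I - gamma * P) v = reward_vector
        # which np.linalg.solve computes below.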
        v = np.linalg.solve(np.eye(n_states) - self.gamma * P, reward_vector)
        return v.reshape((self.rows, self.cols))

    def show(self, v=None):
        """Print and plot the value function in grid format."""
        if v is None:
            v = self.bellman()
        print(v)
        plt.figure(figsize=(6, 5))
        sns.heatmap(
            v,            # matrix (rows x cols)
            annot=True,   # show the values
            fmt=".2f",    # 2 decimal places
            cmap="Blues",
            linewidths=0.5,
            cbar=True
        )
        plt.title("Value of v_pi for the random policy")
        plt.tight_layout()
        plt.show()


if __name__ == "__main__":
    env = Gridworld()
    solver = Solver(env, action_probability=0.25, gamma=0.5)
    v = solver.bellman()
    solver.show(v)
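
    # A minimal sanity check (sketch): iterative policy evaluation under the same
    # random policy should converge to the closed-form values computed above.
    # The sweep count and tolerance below are arbitrary choices.
    v_iter = np.zeros((env.rows, env.cols))
    for _ in range(1000):
        v_new = np.zeros_like(v_iter)
        for i in range(env.rows):
            for j in range(env.cols):
                for action in solver.actions:
                    env.state = (i, j)
                    (ni, nj), rew = env.step(action)
                    v_new[i, j] += solver.action_probability * (rew + solver.gamma * v_iter[ni, nj])
        diff = np.max(np.abs(v_new - v_iter))
        v_iter = v_new
        if diff < 1e-8:
            break
    print("Max difference vs. closed-form solution:", np.max(np.abs(v_iter - v)))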