@qgallouedec
Created May 7, 2020 21:11
import numpy as np

# Monte Carlo control with an ε-greedy policy: initialise Q and the visit counter N
Q_pi = np.ones((nb_states, nb_actions))
N = np.zeros_like(Q_pi)
for i_generation in range(nb_generation):
    # Define π' = ε-greedy policy w.r.t. Q_π
    pi = epsilon_greedy_policy(Q_pi, eps)
    # Generate one episode by following π'
    states, actions, rewards = generate_episode(env, pi)
    # From this episode, get a rough approximation of Q_π'
    for t in range(len(states)):
        # Compute the discounted return Gt from time t
        # Gt = rewards[t] + gamma*Gt
        Gt = compute_gain(rewards, t, gamma)
        # δ_t = G_t - Q(S_t, A_t)
        delta_t = Gt - Q_pi[states[t]][actions[t]]
        # Increment the counter for this state-action pair
        N[states[t]][actions[t]] += 1
        # Incremental mean update: Q(S_t, A_t) += δ_t / N(S_t, A_t)
        Q_pi[states[t]][actions[t]] += delta_t / N[states[t]][actions[t]]
    # Decay ε after each episode so the policy becomes greedier over time
    eps *= 0.99
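
The snippet calls three helpers that are not shown here: epsilon_greedy_policy, generate_episode and compute_gain. The sketch below is one possible implementation, assuming a Gym-style discrete environment whose reset() returns a state index and whose step(action) returns the classic (next_state, reward, done, info) tuple; the exact signatures and behaviour of the original helpers may differ.

import numpy as np

def epsilon_greedy_policy(Q, eps):
    # Return a stochastic policy of shape (nb_states, nb_actions):
    # with probability eps pick a uniformly random action, otherwise the greedy one.
    nb_states, nb_actions = Q.shape
    pi = np.full((nb_states, nb_actions), eps / nb_actions)
    pi[np.arange(nb_states), Q.argmax(axis=1)] += 1.0 - eps
    return pi

def generate_episode(env, pi):
    # Roll out one episode following the stochastic policy pi.
    # Returns the lists of visited states, taken actions and received rewards.
    states, actions, rewards = [], [], []
    state = env.reset()
    done = False
    while not done:
        action = np.random.choice(len(pi[state]), p=pi[state])
        next_state, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state
    return states, actions, rewards

def compute_gain(rewards, t, gamma):
    # Discounted return G_t = sum over k >= 0 of gamma^k * rewards[t + k]
    Gt = 0.0
    for k, reward in enumerate(rewards[t:]):
        Gt += (gamma ** k) * reward
    return Gt

With these in place, the main loop still expects nb_states, nb_actions, nb_generation, eps and gamma to be defined; for a discrete Gym environment, nb_states = env.observation_space.n and nb_actions = env.action_space.n.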