sritee/PG.py

## PG.py
# Improved version of the gist at https://gist.github.com/MikeOuimet/f50904374651c9c15f236aef3109d57
# Made backwards compatible by refactoring the code that involved tf.diag gradients.

import numpy as np
import gym
import tensorflow as tf
import matplotlib.pyplot as plt

def weight_variable(shape):
  """Return a TensorFlow variable of the given shape for use as weights.

  The initial values are drawn from a truncated normal distribution with a
  standard deviation of 0.1.
  """
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
  """Return a TensorFlow variable of the given shape for use as biases.

  Every element starts at the constant value 0.2.
  """
  return tf.Variable(tf.constant(0.2, shape=shape))


# Hyperparameters.
num_nodes = 100  # width of the hidden layer
num_gradients = 1  # train_step invocations per collected episode
maxsteps = 500  # rows preallocated per episode buffer; also the loop's step limit
num_runs = 1000  # number of episodes to run

# Environment: CartPole-v0 with its `_max_episode_steps` attribute set to 200.
env = gym.make('CartPole-v0')
env._max_episode_steps = 200

# Network input size (largest entry of the observation-space shape) and
# output size (number of discrete actions).
dim = max(np.shape(env.observation_space))
dim_actions = env.action_space.n

# --- Policy network and training op ---------------------------------------
# Placeholders; the leading (None) dimension is the number of rows fed.
state = tf.placeholder(tf.float32, shape=[None, dim])
action_choice = tf.placeholder(tf.int32, shape=[None])
reward_signal = tf.placeholder(tf.float32, shape=(None, 1))
# No op built below reads this placeholder; it is kept because the training
# loop still passes it in its feed_dict.
n_timesteps = tf.placeholder(tf.float32, shape=())

# Hidden layer: ReLU over an affine map of the state.
W1 = weight_variable([dim, num_nodes])
b1 = bias_variable([num_nodes])
a1 = tf.nn.relu(tf.matmul(state, W1) + b1)

# Output layer: softmax over the dim_actions actions.
Wo = weight_variable([num_nodes, dim_actions])
bo = bias_variable([dim_actions])
ao = tf.nn.softmax(tf.matmul(a1, Wo) + bo)

# Pick out the probability of the action taken in each row: flatten `ao`
# and gather element  row * dim_actions + action.
ao_flat = tf.reshape(ao, (-1, 1))
chosen_actions = tf.range(0, tf.shape(ao)[0]) * tf.shape(ao)[1] + action_choice
log_prob = tf.log(tf.gather(ao_flat, chosen_actions))

# Per-row loss: -log(probability of chosen action) * reward_signal.
loss = tf.multiply(log_prob, reward_signal)
loss = -tf.reshape(loss, [-1])
train_step = tf.train.AdamOptimizer().minimize(loss)

# One session only: the script previously opened a tf.InteractiveSession and
# immediately shadowed it with tf.Session, leaving the first one unclosed.
# tf.global_variables_initializer replaces the deprecated
# tf.initialize_all_variables.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)


# Number of timesteps each episode lasted; plotted once training finishes.
timestep_learning = np.zeros((num_runs, 1))
for run in range(num_runs):
    # Per-episode buffers, preallocated to maxsteps rows and trimmed below.
    states = np.zeros((maxsteps, dim), dtype='float32')
    # Stored as integers because they are fed to the int32 `action_choice`
    # placeholder.
    actions = np.zeros((maxsteps, 1), dtype='int32')
    rewards = np.zeros((maxsteps, 1), dtype='float32')
    timestep = 0
    observation = env.reset()
    observation = np.reshape(observation, (1, dim))
    done = False
    # Every 50th run is rendered and reported.
    show_run = run % 50 == 0

    while not done and timestep < maxsteps:
        if show_run:
            env.render()
        action_prob = sess.run(ao, feed_dict={state: observation})
        # np.random.multinomial rejects pvals whose leading entries sum to
        # more than 1; float32 rounding can overshoot, so sample from a
        # renormalised float64 copy.
        probs = action_prob[0].astype(np.float64)
        probs /= probs.sum()
        action = np.argmax(np.random.multinomial(1, probs))
        new_observation, reward, done, info = env.step(action)

        # Record the transition taken from the current observation.
        states[timestep, :] = observation
        actions[timestep] = action
        rewards[timestep, :] = reward
        timestep += 1

        observation[:] = new_observation

    # Keep only the rows that were actually filled.
    states = states[:timestep, :]
    actions = actions[:timestep, :]
    rewards = rewards[:timestep, :]
    # Replace each reward with the sum of rewards from that step to the end
    # of the episode (reverse, cumulative sum, reverse back).
    rewards[:, 0] = np.cumsum(rewards[::-1])[::-1]

    if show_run:
        print ('run #: ', run)
        print ('Time lasted: ', timestep)
    for _ in range(num_gradients):
        sess.run(train_step, feed_dict={state: states, action_choice: actions.flatten(), reward_signal: rewards, n_timesteps: timestep})
    timestep_learning[run] = timestep

# The monitor is never started in this script, so there is nothing to close
# on it; env.close() releases the environment (including any render window).
env.close()
sess.close()
plt.plot(timestep_learning)
plt.show()
	# Improved version of the gist at https://gist.github.com/MikeOuimet/f50904374651c9c15f236aef3109d57
	# Made backwards compatible by refactoring the code that involved tf.diag gradients.

	import numpy as np
	import gym
	import tensorflow as tf
	import matplotlib.pyplot as plt

	def weight_variable(shape):
		"""Return a TensorFlow variable of the given shape for use as weights.

		The initial values are drawn from a truncated normal distribution with
		a standard deviation of 0.1.
		"""
		return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

	def bias_variable(shape):
		"""Return a TensorFlow variable of the given shape for use as biases.

		Every element starts at the constant value 0.2.
		"""
		return tf.Variable(tf.constant(0.2, shape=shape))


	# Hyperparameters.
	num_nodes = 100  # width of the hidden layer
	num_gradients = 1  # train_step invocations per collected episode
	maxsteps = 500  # rows preallocated per episode buffer; also the loop's step limit
	num_runs = 1000  # number of episodes to run

	# Environment: CartPole-v0 with its `_max_episode_steps` attribute set to 200.
	env = gym.make('CartPole-v0')
	env._max_episode_steps = 200

	# Network input size (largest entry of the observation-space shape) and
	# output size (number of discrete actions).
	dim = max(np.shape(env.observation_space))
	dim_actions = env.action_space.n

	# --- Policy network and training op ---------------------------------------
	# Placeholders; the leading (None) dimension is the number of rows fed.
	state = tf.placeholder(tf.float32, shape=[None, dim])
	action_choice = tf.placeholder(tf.int32, shape=[None])
	reward_signal = tf.placeholder(tf.float32, shape=(None, 1))
	# No op built below reads this placeholder; it is kept because the training
	# loop still passes it in its feed_dict.
	n_timesteps = tf.placeholder(tf.float32, shape=())

	# Hidden layer: ReLU over an affine map of the state.
	W1 = weight_variable([dim, num_nodes])
	b1 = bias_variable([num_nodes])
	a1 = tf.nn.relu(tf.matmul(state, W1) + b1)

	# Output layer: softmax over the dim_actions actions.
	Wo = weight_variable([num_nodes, dim_actions])
	bo = bias_variable([dim_actions])
	ao = tf.nn.softmax(tf.matmul(a1, Wo) + bo)

	# Pick out the probability of the action taken in each row: flatten `ao`
	# and gather element  row * dim_actions + action.
	ao_flat = tf.reshape(ao, (-1, 1))
	chosen_actions = tf.range(0, tf.shape(ao)[0]) * tf.shape(ao)[1] + action_choice
	log_prob = tf.log(tf.gather(ao_flat, chosen_actions))

	# Per-row loss: -log(probability of chosen action) * reward_signal.
	loss = tf.multiply(log_prob, reward_signal)
	loss = -tf.reshape(loss, [-1])
	train_step = tf.train.AdamOptimizer().minimize(loss)

	# One session only: the script previously opened a tf.InteractiveSession and
	# immediately shadowed it with tf.Session, leaving the first one unclosed.
	# tf.global_variables_initializer replaces the deprecated
	# tf.initialize_all_variables.
	init = tf.global_variables_initializer()
	sess = tf.Session()
	sess.run(init)


	# Number of timesteps each episode lasted; plotted once training finishes.
	timestep_learning = np.zeros((num_runs, 1))
	for run in range(num_runs):
		# Per-episode buffers, preallocated to maxsteps rows and trimmed below.
		states = np.zeros((maxsteps, dim), dtype='float32')
		# Stored as integers because they are fed to the int32 `action_choice`
		# placeholder.
		actions = np.zeros((maxsteps, 1), dtype='int32')
		rewards = np.zeros((maxsteps, 1), dtype='float32')
		timestep = 0
		observation = env.reset()
		observation = np.reshape(observation, (1, dim))
		done = False
		# Every 50th run is rendered and reported.
		show_run = run % 50 == 0

		while not done and timestep < maxsteps:
			if show_run:
				env.render()
			action_prob = sess.run(ao, feed_dict={state: observation})
			# np.random.multinomial rejects pvals whose leading entries sum to
			# more than 1; float32 rounding can overshoot, so sample from a
			# renormalised float64 copy.
			probs = action_prob[0].astype(np.float64)
			probs /= probs.sum()
			action = np.argmax(np.random.multinomial(1, probs))
			new_observation, reward, done, info = env.step(action)

			# Record the transition taken from the current observation.
			states[timestep, :] = observation
			actions[timestep] = action
			rewards[timestep, :] = reward
			timestep += 1

			observation[:] = new_observation

		# Keep only the rows that were actually filled.
		states = states[:timestep, :]
		actions = actions[:timestep, :]
		rewards = rewards[:timestep, :]
		# Replace each reward with the sum of rewards from that step to the end
		# of the episode (reverse, cumulative sum, reverse back).
		rewards[:, 0] = np.cumsum(rewards[::-1])[::-1]

		if show_run:
			print ('run #: ', run)
			print ('Time lasted: ', timestep)
		for _ in range(num_gradients):
			sess.run(train_step, feed_dict={state: states, action_choice: actions.flatten(), reward_signal: rewards, n_timesteps: timestep})
		timestep_learning[run] = timestep

	# The monitor is never started in this script, so there is nothing to close
	# on it; env.close() releases the environment (including any render window).
	env.close()
	sess.close()
	plt.plot(timestep_learning)
	plt.show()
No results found