OpenAI Gym LunarLander-v2 solved with a genetic algorithm (NEAT, via neat-python).
Sample run: https://gym.openai.com/evaluations/eval_FbKq5MxAS9GlvB7W6ioJkg
# neat-python configuration for the LunarLander-v2 environment on OpenAI Gym

[NEAT]
pop_size              = 150
# Note: the fitness threshold will never be reached because
# we are controlling the termination ourselves based on simulation performance.
fitness_criterion     = max
fitness_threshold     = 1000.0
reset_on_extinction   = 0

[DefaultGenome]
num_inputs            = 8
num_hidden            = 0
num_outputs           = 4
initial_connection    = full
feed_forward          = True
compatibility_disjoint_coefficient = 1.0
compatibility_weight_coefficient   = 1.0
conn_add_prob         = 0.15
conn_delete_prob      = 0.1
node_add_prob         = 0.15
node_delete_prob      = 0.1
activation_default    = clamped
activation_options    = clamped
activation_mutate_rate = 0.0
aggregation_default   = sum
aggregation_options   = sum
aggregation_mutate_rate = 0.0
bias_init_mean        = 0.0
bias_init_stdev       = 1.0
bias_replace_rate     = 0.02
bias_mutate_rate      = 0.8
bias_mutate_power     = 0.4
bias_max_value        = 30.0
bias_min_value        = -30.0
response_init_mean    = 1.0
response_init_stdev   = 0.0
response_replace_rate = 0.0
response_mutate_rate  = 0.1
response_mutate_power = 0.01
response_max_value    = 30.0
response_min_value    = -30.0
weight_max_value      = 30
weight_min_value      = -30
weight_init_mean      = 0.0
weight_init_stdev     = 1.0
weight_mutate_rate    = 0.8
weight_replace_rate   = 0.02
weight_mutate_power   = 0.4
enabled_default       = True
enabled_mutate_rate   = 0.01

[DefaultSpeciesSet]
compatibility_threshold = 3.0

[DefaultStagnation]
species_fitness_func = mean
max_stagnation       = 15
species_elitism      = 4

[DefaultReproduction]
elitism              = 2
survival_threshold   = 0.2
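The genome I/O above mirrors the environment: LunarLander-v2 observations are 8-dimensional and the action space is Discrete(4), hence num_inputs = 8 and num_outputs = 4. The clamped activation limits every node output to [-1, 1], which matches the normalized reward targets used during fitness evaluation below. A minimal sketch of that activation (my reading of neat-python's built-in 'clamped' function, shown for illustration):

def clamped(z):
    # Identity response, limited to the range [-1, 1].
    return max(-1.0, min(1.0, z))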
# Evolve a control/reward estimation network for the OpenAI Gym
# LunarLander-v2 environment (https://gym.openai.com/envs/LunarLander-v2).
# Sample run here: https://gym.openai.com/evaluations/eval_FbKq5MxAS9GlvB7W6ioJkg
from __future__ import print_function

import multiprocessing
import os
import pickle
import random
import time

import gym
import gym.wrappers
import matplotlib.pyplot as plt
import numpy as np

import neat
import visualize  # visualize.py from the neat-python examples directory

env = gym.make('LunarLander-v2')

print("action space: {0!r}".format(env.action_space))
print("observation space: {0!r}".format(env.observation_space))

# Limit episode time steps to cut down on training time.
# 400 steps is more than enough time to land with a winning score.
# (This uses the spec-tag mechanism of older gym releases.)
print(env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps'))
env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] = 400
print(env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps'))

env = gym.wrappers.Monitor(env, 'results', force=True)

discounted_reward = 0.9  # discount factor (gamma) for future rewards
min_reward = -200
max_reward = 200

score_range = []


def compute_fitness(net, discounted_rewards, episodes):
    """Return squared errors between the network's per-action return
    estimates and the observed discounted rewards."""
    reward_error = []
    for episode_drs, episode in zip(discounted_rewards, episodes):
        for (j, observation, action, reward), dr in zip(episode, episode_drs):
            output = net.activate(observation)
            reward_error.append(float((output[action] - dr) ** 2))

    return reward_error
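
# The network being evolved outputs, for each of the four actions, an estimate
# of the (normalized) discounted future return; compute_fitness measures how far
# output[action] is from the return dr actually observed at that step. Worked
# example: with gamma = 0.9 and per-step rewards [1, 0, 2], the discounted
# returns are [1 + 0.9*0 + 0.81*2, 0 + 0.9*2, 2] = [2.62, 1.8, 2.0].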


class PooledErrorCompute(object):
    def __init__(self):
        self.pool = multiprocessing.Pool()

    def evaluate_genomes(self, genomes, config):
        t0 = time.time()
        nets = []
        for gid, g in genomes:
            nets.append((g, neat.nn.FeedForwardNetwork.create(g, config)))
            g.fitness = 0.0  # placeholder; replaced after the episode below

        print("network creation time {0}".format(time.time() - t0))
        t0 = time.time()

        episodes = []
        for genome, net in nets:
            observation = env.reset()
            episode_data = []
            j = 0
            total_score = 0.0
            while True:
                if net is not None:
                    output = net.activate(observation)
                    action = np.argmax(output)
                else:
                    action = env.action_space.sample()

                observation, reward, done, info = env.step(action)
                total_score += reward
                episode_data.append((j, observation, action, reward))

                if done:
                    break

                j += 1

            episodes.append((total_score, episode_data))
            genome.fitness = total_score

        print("simulation run time {0}".format(time.time() - t0))
        t0 = time.time()

        scores = [s for s, e in episodes]
        score_range.append((min(scores), np.mean(scores), max(scores)))

        # Compute discounted rewards.
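        # D is upper triangular with D[t, t + i] = gamma**i, so np.dot(D, rewards)[t]
        # equals sum_i gamma**i * rewards[t + i]: the discounted return from step t
        # onward. An O(N) backward recurrence G[t] = rewards[t] + gamma * G[t + 1]
        # would compute the same values.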
        discounted_rewards = []
        for score, episode in episodes:
            rewards = np.array([reward for j, observation, action, reward in episode])
            N = len(episode)
            # Use the builtin sum here; np.sum over a generator is unreliable.
            D = sum(np.eye(N, k=i) * discounted_reward ** i for i in range(N))
            discounted_rewards.append(np.dot(D, rewards))

        print(min(map(np.min, discounted_rewards)), max(map(np.max, discounted_rewards)))
        # Normalize rewards.
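        # The affine map 2 * (x - min) / (max - min) - 1 sends the assumed score
        # range [-200, 200] to [-1, 1], matching the range of the clamped network
        # outputs the errors are measured against.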
        for i in range(len(discounted_rewards)):
            discounted_rewards[i] = 2 * (discounted_rewards[i] - min_reward) / (max_reward - min_reward) - 1.0

        print(min(map(np.min, discounted_rewards)), max(map(np.max, discounted_rewards)))
        print("discounted & normalized reward compute time {0}".format(time.time() - t0))
        t0 = time.time()

        # Randomly choose a subset of episodes for evaluation of genome reward
        # estimation, keeping each episode paired with its own discounted-reward
        # sequence (pairing a random episode with an unrelated reward sequence
        # would make the error targets meaningless).
        sample_indices = [random.randrange(len(episodes)) for _ in range(10)]
        comparison_episodes = [episodes[i][1] for i in sample_indices]
        comparison_rewards = [discounted_rewards[i] for i in sample_indices]

        jobs = []
        for genome, net in nets:
            jobs.append(self.pool.apply_async(
                compute_fitness, (net, comparison_rewards, comparison_episodes)))

        # Assign a composite fitness to each genome; genomes can make progress either
        # by improving their total reward or by making more accurate reward estimates.
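        # The factor of 150 sets the exchange rate between the two objectives:
        # estimation errors (targets lie in [-1, 1]) are scaled up to be
        # comparable with episode scores, which span roughly [-200, 200].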
        for job, (genome_id, genome) in zip(jobs, genomes):
            reward_error = job.get(timeout=None)
            genome.fitness -= 150 * np.mean(reward_error)

        print("final fitness compute time {0}\n".format(time.time() - t0))


def run():
    # Load the config file, which is assumed to live in
    # the same directory as this script.
    local_dir = os.path.dirname(__file__)
    config_path = os.path.join(local_dir, 'config')
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_path)

    pop = neat.Population(config)
    stats = neat.StatisticsReporter()
    pop.add_reporter(stats)
    pop.add_reporter(neat.StdOutReporter(True))
    # Checkpoint every 10 generations or 900 seconds.
    pop.add_reporter(neat.Checkpointer(10, 900))

    # Run until the winner from a generation is able to solve the environment
    # or the user interrupts the process.
    ec = PooledErrorCompute()
    while True:
        try:
            pop.run(ec.evaluate_genomes, 1)
            visualize.plot_stats(stats, ylog=False, view=False, filename="fitness.svg")

            if score_range:
                S = np.array(score_range).T
                plt.plot(S[0], 'r-')  # per-generation minimum score
                plt.plot(S[1], 'b-')  # per-generation mean score
                plt.plot(S[2], 'g-')  # per-generation maximum score
                plt.grid()
                plt.savefig("score-ranges.svg")
                plt.close()

            mfs = sum(stats.get_fitness_mean()[-5:]) / 5.0
            print("Average mean fitness over last 5 generations: {0}".format(mfs))

            mfs = sum(stats.get_fitness_stat(min)[-5:]) / 5.0
            print("Average min fitness over last 5 generations: {0}".format(mfs))
            # Use the five best genomes seen so far as an ensemble-ish control system.
            best_genomes = stats.best_unique_genomes(5)
            best_networks = []
            for g in best_genomes:
                best_networks.append(neat.nn.FeedForwardNetwork.create(g, config))

            solved = True
            best_scores = []
            for k in range(100):
                observation = env.reset()
                score = 0
                while True:
                    # Use the total reward estimates from all five networks to
                    # determine the best action given the current state.
                    total_rewards = np.zeros((4,))
                    for n in best_networks:
                        output = n.activate(observation)
                        total_rewards += output

                    best_action = np.argmax(total_rewards)
                    observation, reward, done, info = env.step(best_action)
                    score += reward
                    env.render()
                    if done:
                        break

                best_scores.append(score)
                avg_score = sum(best_scores) / len(best_scores)
                print(k, score, avg_score)
                if avg_score < 200:
                    solved = False
                    break
            if solved:
                print("Solved.")

                # Save the winners.
                for n, g in enumerate(best_genomes):
                    name = 'winner-{0}'.format(n)
                    with open(name + '.pickle', 'wb') as f:
                        pickle.dump(g, f)

                    visualize.draw_net(config, g, view=False,
                                       filename=name + "-net.gv")
                    visualize.draw_net(config, g, view=False,
                                       filename=name + "-net-enabled.gv",
                                       show_disabled=False)
                    visualize.draw_net(config, g, view=False,
                                       filename=name + "-net-enabled-pruned.gv",
                                       show_disabled=False, prune_unused=True)

                break
        except KeyboardInterrupt:
            print("User break.")
            break

    env.close()


if __name__ == '__main__':
    run()
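
Once a run finishes, the saved genomes can be replayed against a fresh environment. A minimal replay sketch, assuming the winner-0.pickle and config files produced by the script above, and the same old-gym API:

import pickle

import gym
import neat
import numpy as np

config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation,
                     'config')
with open('winner-0.pickle', 'rb') as f:
    genome = pickle.load(f)
net = neat.nn.FeedForwardNetwork.create(genome, config)

env = gym.make('LunarLander-v2')
observation = env.reset()
score = 0.0
while True:
    # Pick the action whose estimated discounted return is highest.
    action = np.argmax(net.activate(observation))
    observation, reward, done, info = env.step(action)
    score += reward
    env.render()
    if done:
        break
print("episode score:", score)
env.close()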