import random
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper

# "gg_det" is the name of the Unity environment build to launch
unity_env = UnityEnvironment("gg_det")
env = UnityToGymWrapper(unity_env)
def get_reduced_observation(state):
    # keep only the first two elements of the observation
    return state[0:2]

def nn_act(nn_model, state):
    # epsilon-greedy: once a model exists, follow the KNN policy 80% of the
    # time; otherwise (or with probability 0.2) sample a random action
    if (nn_model is not None) and (random.random() < 0.8):
        return nn_model.predict([get_reduced_observation(state)])[0]
    return env.action_space.sample()
def nn_fit(replay_buffer):
    nn_model = KNeighborsClassifier(n_neighbors=5)
    if len(replay_buffer) > 10:
        # sort rollouts by adjusted reward and keep the top 10 percent
        replay_buffer.sort(key=lambda x: x[1], reverse=True)
        top_10_percent = replay_buffer[:len(replay_buffer) // 10]
        X = []
        y = []
        # break each elite trajectory into (s, a) pairs: s goes to X, a goes
        # to y; the adjusted reward and per-step rewards are ignored here
        for roll_out, _ in top_10_percent:
            for step in roll_out:
                obs, action, reward = step
                X.append(get_reduced_observation(obs))
                y.append(action)
        nn_model.fit(np.array(X), np.array(y))
        return nn_model
    return None
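# --- Illustrative sketch, not part of the original gist ---
# nn_fit returns a KNeighborsClassifier acting as the policy: given a reduced
# 2-D observation, it votes over the actions of the k nearest elite (s, a)
# pairs. The toy data below is made up purely to show the fit/predict contract;
# _demo_knn_policy is a hypothetical helper and is never called by the script.
def _demo_knn_policy():
    toy_X = np.array([[0.0, 0.0], [0.1, 0.0], [0.9, 1.0], [1.0, 0.9], [0.5, 0.5]])
    toy_y = np.array([0, 0, 1, 1, 0])  # hypothetical discrete actions
    model = KNeighborsClassifier(n_neighbors=3)  # the gist itself uses n_neighbors=5
    model.fit(toy_X, toy_y)
    # the three nearest neighbors of (0.05, 0.05) all carry action 0, so it predicts 0
    print(model.predict([[0.05, 0.05]]))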
def get_opponent_distance_from_center(obs):
    # the opponent's (x, y) sits at observation indices 2 and 3
    opponent_xy = obs[2:4]
    return np.linalg.norm(opponent_xy)

nn_model = None
replay_buffer = []

for i in range(10000):
    print(f'Buffer size: {len(replay_buffer)}')
    # print the adjusted reward of the last 10 rollouts in the buffer
    # (nn_fit re-sorts the buffer descending, so these are the current worst)
    print(f'Adjusted Reward: {[x[1] for x in replay_buffer[-10:]]}')
    # collect experience
    for j in range(10):
        obs = env.reset()
        done = False
        roll_out = []
        while not done:
            action = nn_act(nn_model, obs)
            next_obs, reward, done, info = env.step(action)
            # pair the action with the state it was chosen in,
            # not the state it led to
            roll_out.append((obs, action, reward))
            obs = next_obs
        # score the whole trajectory by the furthest we push the opponent from center
        adjusted_reward = max(get_opponent_distance_from_center(step[0]) for step in roll_out)
        replay_buffer.append((roll_out, adjusted_reward))
    # refit the policy on the elite rollouts
    nn_model = nn_fit(replay_buffer)
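# --- Hypothetical evaluation sketch, not part of the original gist ---
# Runs the learned KNN policy greedily (no random exploration) for a few
# episodes and reports the same adjusted-reward metric used during training.
# `evaluate_policy` and `n_episodes` are names introduced here for
# illustration; the env and helpers above are reused as-is.
def evaluate_policy(env, nn_model, n_episodes=5):
    scores = []
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        best = 0.0
        while not done:
            if nn_model is not None:
                action = nn_model.predict([get_reduced_observation(obs)])[0]
            else:
                action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            best = max(best, get_opponent_distance_from_center(obs))
        scores.append(best)
    return scores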