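# Self-imitation with a k-NN policy on a Unity ML-Agents environment:
# repeatedly collect rollouts, score each trajectory by how far it pushes
# the opponent from the center, then behavior-clone the top 10% of
# trajectories with a KNeighborsClassifier.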
import random

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper
# launch the Unity build "gg_det" and expose it through the gym interface
unity_env = UnityEnvironment("gg_det")
env = UnityToGymWrapper(unity_env)


def get_reduced_observation(state):
    return state[0:2]  # reduce the observation to its first two elements


def nn_act(nn_model, state):
    # follow the learned policy 80% of the time, explore randomly otherwise
    if (nn_model is not None) and (random.random() < 0.8):
        return nn_model.predict([get_reduced_observation(state)])[0]
    return env.action_space.sample()


def nn_fit(replay_buffer):
    nn_model = KNeighborsClassifier(n_neighbors=5)
    if len(replay_buffer) > 10:
        # keep only the top 10 percent of trajectories by adjusted reward
        replay_buffer.sort(key=lambda x: x[1], reverse=True)
        top_10_percent = replay_buffer[:len(replay_buffer) // 10]
        X = []
        y = []
        # break each trajectory into (s, a) pairs: s goes to X, a goes to y;
        # the adjusted reward is no longer needed once a trajectory makes the cut
        for roll_out, _ in top_10_percent:
            for step in roll_out:
                obs, action, reward = step
                X.append(get_reduced_observation(obs))
                y.append(action)
        nn_model.fit(np.array(X), np.array(y))
        return nn_model
    return None
# reward helper: distance of the opponent from the center; the opponent's
# xy position sits at observation indices 2 and 3
def get_opponent_distance_from_center(obs):
    opponent_xy = obs[2:4]
    return np.linalg.norm(opponent_xy)


nn_model = None
replay_buffer = []
for i in range(10000):
    print(f'Buffer size: {len(replay_buffer)}')
    # peek at the adjusted rewards of the last 10 entries (nn_fit sorts the
    # buffer in place, so after the first fit these are the lowest scores)
    print(f'Adjusted Reward: {[x[1] for x in replay_buffer[-10:]]}')
    # collect experience: 10 rollouts per outer iteration
    for j in range(10):
        obs = env.reset()
        done = False
        roll_out = []
        while not done:
            action = nn_act(nn_model, obs)
            next_obs, reward, done, info = env.step(action)
            # pair each pre-action observation with the action taken in it
            roll_out.append((obs, action, reward))
            obs = next_obs
        # score the entire trajectory by the furthest the opponent gets
        # from the center over the visited states
        adjusted_reward = max(get_opponent_distance_from_center(step[0]) for step in roll_out)
        replay_buffer.append((roll_out, adjusted_reward))
    # refit the policy on the updated buffer
    nn_model = nn_fit(replay_buffer)
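
# A minimal post-training sketch (an illustrative addition, not part of the
# training loop above): roll the learned policy out once, greedily, then
# close the environment.
if nn_model is not None:
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action = nn_model.predict([get_reduced_observation(obs)])[0]
        obs, reward, done, info = env.step(action)
        total_reward += reward
    print(f'Greedy rollout reward: {total_reward}')
env.close()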