@MostHumble
Created January 17, 2026 06:28
The Epsilon Trap - Experimental Code for Blog Post
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
# ============================================================================
# CONFIGURATION
# ============================================================================
# Optimization parameters
STEPS = 400
LEARNING_RATE = 0.5
START_POS = [1.0, 1.0]
SCALE_X = 1e-4 # Creates the "Micro-Canyon" geometry
# Epsilon values to test
EPS_LIST = [1e-6, 1e-8, 1e-10]
COLORS = ['blue', 'green', 'red'] # Explicit mapping for clarity
# Animation parameters
FPS = 15
DURATION_SEC = 16
# ============================================================================
# CORE FUNCTIONS
# ============================================================================
def loss_fn(x, y):
    """Micro-Canyon loss landscape: steep in Y, flat in X."""
    return (x * SCALE_X)**2 + y**2
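
# Gradient sketch: dL/dx = 2 * SCALE_X**2 * x ≈ 2e-8 * x, while dL/dy = 2 * y.
# At the start point (1, 1) the X-gradient is therefore about eight orders of
# magnitude smaller than the Y-gradient, so Adam's second-moment estimate along
# X shrinks to the order of epsilon (or below), which is where the "trap" kicks in.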
def run_optimizer(epsilon, track_metrics=False):
    """
    Run Adam optimizer with specified epsilon value.

    Args:
        epsilon: Adam's epsilon hyperparameter
        track_metrics: If True, also track gradients and update sizes

    Returns:
        If track_metrics=False: numpy array of shape (STEPS, 2) containing trajectory
        If track_metrics=True: tuple of (trajectory, gradients_x, updates_x)
    """
    pos = torch.tensor(START_POS, requires_grad=True, dtype=torch.float32)
    optimizer = torch.optim.Adam([pos], lr=LEARNING_RATE, eps=epsilon)
    path = []
    grads_x = [] if track_metrics else None
    updates_x = [] if track_metrics else None

    for _ in range(STEPS):
        # Store current position
        p_prev = pos.detach().clone()
        path.append(p_prev.numpy().copy())

        # Compute gradients
        optimizer.zero_grad()
        loss = (pos[0] * SCALE_X)**2 + pos[1]**2
        loss.backward()

        # Track metrics if requested
        if track_metrics:
            grads_x.append(abs(pos.grad[0].item()))

        # Take optimization step
        optimizer.step()

        # Track update size if requested
        if track_metrics:
            p_curr = pos.detach()
            updates_x.append(abs(p_curr[0].item() - p_prev[0].item()))

    path = np.array(path)
    if track_metrics:
        return path, np.array(grads_x), np.array(updates_x)
    else:
        return path
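
# Mechanism sketch (standard Adam, bias correction omitted): each coordinate is
# updated by roughly  step ≈ lr * m_hat / (sqrt(v_hat) + eps).  Along the flat X
# axis the gradients are ~2e-8 * x, so sqrt(v_hat) falls to the same order as eps
# and the per-unit-gradient amplification
#     (|step| / lr) / |grad| ≈ 1 / (sqrt(v_hat) + eps)
# saturates near its ceiling of 1/eps, which is the dashed line drawn for each
# epsilon in plot_amplification_analysis() below.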
# ============================================================================
# VISUALIZATION 1: LOSS LANDSCAPE (ROTATING GIF)
# ============================================================================
def create_loss_landscape_gif(filename='elongated_canyon.gif'):
    """Generate rotating 3D visualization of the loss landscape."""
    print("Generating loss landscape...")

    # Generate surface data
    X_range = np.linspace(-6.0, 6.0, 120)
    Y_range = np.linspace(-2.0, 2.0, 120)
    X_grid, Y_grid = np.meshgrid(X_range, Y_range)
    Z_grid = loss_fn(X_grid, Y_grid)

    # Set up figure
    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(111, projection='3d')

    # Plot surface and contours
    ax.plot_surface(X_grid, Y_grid, Z_grid, cmap='viridis',
                    alpha=0.8, edgecolor='none', rstride=2, cstride=2, antialiased=True)
    ax.contour(X_grid, Y_grid, Z_grid, zdir='z', offset=0,
               levels=25, cmap='viridis', alpha=0.6)

    # Styling
    ax.set_title("Micro-Canyon Landscape")
    ax.set_xlabel('X (Flat)')
    ax.set_ylabel('Y (Steep)')
    ax.set_zlabel('Loss')
    ax.set_xlim(X_range.min(), X_range.max())
    ax.set_ylim(Y_range.min(), Y_range.max())
    ax.set_zlim(0, Z_grid.max())
    ax.set_yticks([-2, -1, 0, 1, 2])
    ax.set_box_aspect((3, 1, 1))

    # Remove pane fills
    ax.xaxis.pane.fill = False
    ax.yaxis.pane.fill = False
    ax.zaxis.pane.fill = False

    # Animation function
    total_frames = FPS * DURATION_SEC

    def update(frame):
        progress = frame / total_frames
        azim = 45 + (progress * 360)
        elev = 30 + 5 * np.sin(progress * 2 * np.pi)
        ax.view_init(elev=elev, azim=azim)

    # Create and save animation
    print(f"Rendering animation ({total_frames} frames)...")
    ani = animation.FuncAnimation(fig, update, frames=total_frames,
                                  interval=1000/FPS, blit=False)
    ani.save(filename, writer='pillow', fps=FPS)
    print(f"Saved to {filename}")
    plt.close()
# ============================================================================
# VISUALIZATION 2: TRAJECTORY RACE (DUAL-VIEW GIF)
# ============================================================================
def create_trajectory_race_gif(trajectories, filename='adam_3d_eps_race.gif'):
    """Generate dual-view animation showing optimizer trajectories."""
    print("Generating trajectory race animation...")

    fig = plt.figure(figsize=(16, 8))

    # Create two 3D subplots
    ax1 = fig.add_subplot(1, 2, 1, projection='3d')
    ax2 = fig.add_subplot(1, 2, 2, projection='3d')

    # Surface data for context
    X = np.linspace(-0.2, 1.2, 50)
    Y = np.linspace(-0.5, 1.5, 50)
    X_grid, Y_grid = np.meshgrid(X, Y)
    Z_grid = loss_fn(X_grid, Y_grid)

    # Set up both axes
    for ax, title in zip([ax1, ax2],
                         ["View 1: Isometric Overview",
                          "View 2: Valley Floor (Worm's Eye)"]):
        ax.plot_surface(X_grid, Y_grid, Z_grid, cmap='gray',
                        alpha=0.15, edgecolor='none')
        ax.contour(X_grid, Y_grid, Z_grid, zdir='z', offset=0,
                   levels=20, cmap='gray', alpha=0.3)
        ax.set_title(title)
        ax.set_xlabel('X (Flat)')
        ax.set_ylabel('Y (Steep)')
        ax.set_zlabel('Loss')
        ax.set_xlim(-0.2, 1.2)
        ax.set_ylim(-0.5, 1.5)
        ax.set_zlim(0, 1.5)

    # Set viewing angles
    ax1.view_init(elev=30, azim=-60)  # Isometric
    ax2.view_init(elev=5, azim=-10)   # Worm's eye

    # Initialize lines and dots for each epsilon
    labels = [f'eps={eps}' for eps in EPS_LIST]
    lines_ax1 = [ax1.plot([], [], [], color=c, lw=2, label=l)[0]
                 for c, l in zip(COLORS, labels)]
    dots_ax1 = [ax1.plot([], [], [], color=c, marker='o', markersize=8)[0]
                for c in COLORS]
    lines_ax2 = [ax2.plot([], [], [], color=c, lw=2)[0] for c in COLORS]
    dots_ax2 = [ax2.plot([], [], [], color=c, marker='o', markersize=8)[0]
                for c in COLORS]
    ax1.legend(loc='upper right')

    # Animation update function
    def update(frame):
        idx = min(frame * 2, STEPS - 1)  # Speed up 2x
        for j, path in enumerate(trajectories):
            i = min(idx, len(path) - 1)

            # Trajectory up to current frame
            xs = path[:i+1, 0]
            ys = path[:i+1, 1]
            zs = loss_fn(xs, ys)

            # Current position
            cx, cy = [path[i, 0]], [path[i, 1]]
            cz = [loss_fn(path[i, 0], path[i, 1])]

            # Update both views
            for lines, dots in [(lines_ax1, dots_ax1), (lines_ax2, dots_ax2)]:
                lines[j].set_data(xs, ys)
                lines[j].set_3d_properties(zs)
                dots[j].set_data(cx, cy)
                dots[j].set_3d_properties(cz)
        return lines_ax1 + dots_ax1 + lines_ax2 + dots_ax2

    # Create and save animation
    ani = animation.FuncAnimation(fig, update, frames=STEPS//2,
                                  interval=30, blit=False)
    ani.save(filename, writer='pillow', fps=30)
    print(f"Saved to {filename}")
    plt.close()
# ============================================================================
# VISUALIZATION 3: AMPLIFICATION FACTOR ANALYSIS
# ============================================================================
def plot_amplification_analysis(results, filename='epsilon_amplification.png'):
    """
    Plot how epsilon affects the amplification factor (step size / gradient).
    Shows when optimizers enter the 'epsilon trap' regime.
    """
    print("Generating amplification analysis...")
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 8))
    jet_colors = cm.jet(np.linspace(0, 1, len(EPS_LIST)))

    for i, eps in enumerate(EPS_LIST):
        color = jet_colors[i]
        data = results[eps]

        # Filter valid data
        mask = data['grads'] > 1e-12
        grads = data['grads'][mask]

        # Amplification = (step_size / learning_rate) / gradient
        gains = (data['updates'][mask] / LEARNING_RATE) / grads

        # Plot raw scatter (faint)
        ax.scatter(grads, gains, color=color, s=15, alpha=0.12, edgecolors='none')

        # Calculate binned median trend
        df = pd.DataFrame({'grad': grads, 'gain': gains})
        bins = np.logspace(np.log10(grads.min()), np.log10(grads.max()), 30)
        df['bin'] = pd.cut(df['grad'], bins, include_lowest=True)
        grouped = df.groupby('bin', observed=True)['gain']
        trend = grouped.median()
        lower = grouped.quantile(0.25)
        upper = grouped.quantile(0.75)
        bin_centers = [interval.mid for interval in trend.index]

        # Plot median trend
        ax.plot(bin_centers, trend.values, color=color, linewidth=4,
                label=rf'$\epsilon = 10^{{{int(np.log10(eps))}}}$')

        # Shade IQR region
        ax.fill_between(bin_centers, lower.values, upper.values,
                        color=color, alpha=0.20, edgecolor='none')

        # Theoretical ceiling (1/epsilon)
        ceiling = 1 / eps
        ax.axhline(ceiling, color=color, linestyle='--', linewidth=1.8,
                   alpha=0.65, label=f'Ceiling: 1/ε = {ceiling:.0e}')

    # Formatting
    ax.set_xlabel(r'Gradient Magnitude (log scale)', fontsize=14)
    ax.set_ylabel(r'Amplification Factor (log scale)', fontsize=14)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(1e-10, 1e-7)
    ax.set_ylim(1e4, 1e11)
    ax.legend(fontsize=12, loc='upper right', frameon=True, shadow=True)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"Saved to {filename}")
    plt.close()
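
# Reading the plot: when a curve sits on its dashed 1/eps ceiling, sqrt(v_hat) has
# fallen well below eps, i.e. the step size is dictated by the chosen epsilon rather
# than by the local curvature. That saturated regime is the "epsilon trap" referred
# to in the docstring above.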
# ============================================================================
# UTILITY: CHECK FLOATING POINT PRECISION
# ============================================================================
def inspect_dtype(dtype, name):
    """Print floating point characteristics for a given dtype."""
    print(f"\n--- {name} ---")
    info = torch.finfo(dtype)
    print(f"Machine Epsilon (eps): {info.eps}")
    print(f"Smallest Normal (tiny): {info.tiny}")
    print(f"Max Value: {info.max}")

    # Find smallest subnormal empirically
    val = torch.tensor(info.tiny, dtype=dtype)
    smallest_subnormal = val
    while val > 0:
        smallest_subnormal = val
        val = val / 2
    print(f"True Smallest (Subnormal): {smallest_subnormal.item()}")
    print("-" * 50)


def check_precision_limits():
    """Check FP16 and BF16 precision characteristics."""
    print("\nFloating Point Precision Analysis")
    print("=" * 50)
    try:
        inspect_dtype(torch.bfloat16, "BFLOAT16")
    except TypeError:
        print("BFLOAT16 not supported on this device/version.")
    inspect_dtype(torch.float16, "FP16 (Half)")
    inspect_dtype(torch.float32, "FP32 (Float)")
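
# For reference (standard IEEE fp16 / bfloat16 constants): fp16 has machine eps
# ≈ 9.77e-4, smallest normal ≈ 6.10e-5, and smallest subnormal ≈ 5.96e-8; bf16 has
# machine eps ≈ 7.81e-3 and smallest normal ≈ 1.18e-38. An Adam eps of 1e-10 is
# smaller than fp16's smallest subnormal and would round to zero if the optimizer
# state were kept in fp16.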
# ============================================================================
# MAIN EXPERIMENT PIPELINE
# ============================================================================
def run_full_experiment():
    """Execute complete experimental pipeline."""
    print("=" * 70)
    print("THE EPSILON TRAP EXPERIMENT")
    print("=" * 70)

    # Step 1: Check precision limits
    check_precision_limits()

    # Step 2: Generate trajectories
    print("\n" + "=" * 70)
    print("Running optimizers...")
    print("=" * 70)
    trajectories = []
    results = {}
    for eps, color in zip(EPS_LIST, COLORS):
        print(f"\nRunning eps={eps}...")
        path, grads, updates = run_optimizer(eps, track_metrics=True)
        trajectories.append(path)
        results[eps] = {
            'path': path,
            'grads': grads,
            'updates': updates,
            'color': color
        }
        print(f" Final position: {path[-1]}")
        print(f" Steps taken: {len(path)}")

    # Step 3: Create visualizations
    print("\n" + "=" * 70)
    print("Generating visualizations...")
    print("=" * 70)
    create_loss_landscape_gif()
    create_trajectory_race_gif(trajectories)
    plot_amplification_analysis(results)

    print("\n" + "=" * 70)
    print("EXPERIMENT COMPLETE!")
    print("=" * 70)
    print("\nGenerated files:")
    print(" - elongated_canyon.gif (rotating loss landscape)")
    print(" - adam_3d_eps_race.gif (trajectory comparison)")
    print(" - epsilon_amplification.png (amplification analysis)")
    return results


# ============================================================================
# ENTRY POINT
# ============================================================================
if __name__ == "__main__":
    results = run_full_experiment()
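
# Usage sketch: to regenerate only the static amplification plot (skipping the
# slow GIF renders), one could run something like:
#
#     results = {}
#     for eps, color in zip(EPS_LIST, COLORS):
#         path, grads, updates = run_optimizer(eps, track_metrics=True)
#         results[eps] = {'path': path, 'grads': grads, 'updates': updates, 'color': color}
#     plot_amplification_analysis(results)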