The Epsilon Trap - Experimental Code for Blog Post
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd

# ============================================================================
# CONFIGURATION
# ============================================================================

# Optimization parameters
STEPS = 400
LEARNING_RATE = 0.5
START_POS = [1.0, 1.0]
SCALE_X = 1e-4  # Creates the "Micro-Canyon" geometry

# Epsilon values to test
EPS_LIST = [1e-6, 1e-8, 1e-10]
COLORS = ['blue', 'green', 'red']  # Explicit mapping for clarity

# Animation parameters
FPS = 15
DURATION_SEC = 16

# ============================================================================
# CORE FUNCTIONS
# ============================================================================

def loss_fn(x, y):
    """Micro-Canyon loss landscape: steep in Y, flat in X."""
    return (x * SCALE_X)**2 + y**2
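
# Rough gradient scales for this landscape (a back-of-the-envelope aside, not
# part of the original gist): dL/dx = 2 * SCALE_X**2 * x and dL/dy = 2 * y, so
# at the start position (1, 1) the X-gradient is about 2e-8 while the
# Y-gradient is 2. It is this tiny X-gradient that drives Adam's sqrt(v_hat)
# denominator down toward epsilon, the regime the experiment is designed to hit.
def _analytic_gradients(x, y):
    """Analytic gradients of loss_fn, for sanity-checking the scales above."""
    return 2 * SCALE_X**2 * x, 2 * y
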
def run_optimizer(epsilon, track_metrics=False):
    """
    Run Adam optimizer with specified epsilon value.

    Args:
        epsilon: Adam's epsilon hyperparameter
        track_metrics: If True, also track gradients and update sizes

    Returns:
        If track_metrics=False: numpy array of shape (STEPS, 2) containing trajectory
        If track_metrics=True: tuple of (trajectory, gradients_x, updates_x)
    """
    pos = torch.tensor(START_POS, requires_grad=True, dtype=torch.float32)
    optimizer = torch.optim.Adam([pos], lr=LEARNING_RATE, eps=epsilon)

    path = []
    grads_x = [] if track_metrics else None
    updates_x = [] if track_metrics else None

    for _ in range(STEPS):
        # Store current position
        p_prev = pos.detach().clone()
        path.append(p_prev.numpy().copy())

        # Compute gradients
        optimizer.zero_grad()
        loss = loss_fn(pos[0], pos[1])
        loss.backward()

        # Track metrics if requested
        if track_metrics:
            grads_x.append(abs(pos.grad[0].item()))

        # Take optimization step
        optimizer.step()

        # Track update size if requested
        if track_metrics:
            p_curr = pos.detach()
            updates_x.append(abs(p_curr[0].item() - p_prev[0].item()))

    path = np.array(path)

    if track_metrics:
        return path, np.array(grads_x), np.array(updates_x)
    else:
        return path
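
# A minimal sketch of the textbook Adam update (Kingma & Ba), shown here only
# to make explicit where `eps` enters; this is not PyTorch's implementation
# and omits options such as weight decay and amsgrad. The step is
# lr * m_hat / (sqrt(v_hat) + eps): once sqrt(v_hat) drops below eps, the
# denominator is pinned near eps and the step size saturates at roughly
# lr * m_hat / eps, which is the "epsilon trap" ceiling probed below.
def adam_step_sketch(grad, m, v, t, lr=LEARNING_RATE,
                     beta1=0.9, beta2=0.999, eps=1e-8):
    """Return (update, m, v) for one Adam step on a scalar gradient."""
    m = beta1 * m + (1 - beta1) * grad      # first moment (EMA of gradients)
    v = beta2 * v + (1 - beta2) * grad**2   # second moment (EMA of squared gradients)
    m_hat = m / (1 - beta1**t)              # bias correction, t starts at 1
    v_hat = v / (1 - beta2**t)
    update = lr * m_hat / (np.sqrt(v_hat) + eps)
    return update, m, v
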
# ============================================================================
# VISUALIZATION 1: LOSS LANDSCAPE (ROTATING GIF)
# ============================================================================

def create_loss_landscape_gif(filename='elongated_canyon.gif'):
    """Generate rotating 3D visualization of the loss landscape."""
    print("Generating loss landscape...")

    # Generate surface data
    X_range = np.linspace(-6.0, 6.0, 120)
    Y_range = np.linspace(-2.0, 2.0, 120)
    X_grid, Y_grid = np.meshgrid(X_range, Y_range)
    Z_grid = loss_fn(X_grid, Y_grid)

    # Setup figure
    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(111, projection='3d')

    # Plot surface and contours
    ax.plot_surface(X_grid, Y_grid, Z_grid, cmap='viridis',
                    alpha=0.8, edgecolor='none', rstride=2, cstride=2, antialiased=True)
    ax.contour(X_grid, Y_grid, Z_grid, zdir='z', offset=0,
               levels=25, cmap='viridis', alpha=0.6)

    # Styling
    ax.set_title("Micro-Canyon Landscape")
    ax.set_xlabel('X (Flat)')
    ax.set_ylabel('Y (Steep)')
    ax.set_zlabel('Loss')
    ax.set_xlim(X_range.min(), X_range.max())
    ax.set_ylim(Y_range.min(), Y_range.max())
    ax.set_zlim(0, Z_grid.max())
    ax.set_yticks([-2, -1, 0, 1, 2])
    ax.set_box_aspect((3, 1, 1))

    # Remove pane fills
    ax.xaxis.pane.fill = False
    ax.yaxis.pane.fill = False
    ax.zaxis.pane.fill = False

    # Animation function
    total_frames = FPS * DURATION_SEC

    def update(frame):
        progress = frame / total_frames
        azim = 45 + (progress * 360)
        elev = 30 + 5 * np.sin(progress * 2 * np.pi)
        ax.view_init(elev=elev, azim=azim)

    # Create and save animation
    print(f"Rendering animation ({total_frames} frames)...")
    ani = animation.FuncAnimation(fig, update, frames=total_frames,
                                  interval=1000/FPS, blit=False)
    ani.save(filename, writer='pillow', fps=FPS)
    print(f"Saved to {filename}")
    plt.close()
# ============================================================================
# VISUALIZATION 2: TRAJECTORY RACE (DUAL-VIEW GIF)
# ============================================================================

def create_trajectory_race_gif(trajectories, filename='adam_3d_eps_race.gif'):
    """Generate dual-view animation showing optimizer trajectories."""
    print("Generating trajectory race animation...")

    fig = plt.figure(figsize=(16, 8))

    # Create two 3D subplots
    ax1 = fig.add_subplot(1, 2, 1, projection='3d')
    ax2 = fig.add_subplot(1, 2, 2, projection='3d')

    # Surface data for context
    X = np.linspace(-0.2, 1.2, 50)
    Y = np.linspace(-0.5, 1.5, 50)
    X_grid, Y_grid = np.meshgrid(X, Y)
    Z_grid = loss_fn(X_grid, Y_grid)

    # Setup both axes
    for ax, title in zip([ax1, ax2],
                         ["View 1: Isometric Overview",
                          "View 2: Valley Floor (Worm's Eye)"]):
        ax.plot_surface(X_grid, Y_grid, Z_grid, cmap='gray',
                        alpha=0.15, edgecolor='none')
        ax.contour(X_grid, Y_grid, Z_grid, zdir='z', offset=0,
                   levels=20, cmap='gray', alpha=0.3)
        ax.set_title(title)
        ax.set_xlabel('X (Flat)')
        ax.set_ylabel('Y (Steep)')
        ax.set_zlabel('Loss')
        ax.set_xlim(-0.2, 1.2)
        ax.set_ylim(-0.5, 1.5)
        ax.set_zlim(0, 1.5)

    # Set viewing angles
    ax1.view_init(elev=30, azim=-60)  # Isometric
    ax2.view_init(elev=5, azim=-10)   # Worm's eye

    # Initialize lines and dots for each epsilon
    labels = [f'eps={eps}' for eps in EPS_LIST]
    lines_ax1 = [ax1.plot([], [], [], color=c, lw=2, label=l)[0]
                 for c, l in zip(COLORS, labels)]
    dots_ax1 = [ax1.plot([], [], [], color=c, marker='o', markersize=8)[0]
                for c in COLORS]
    lines_ax2 = [ax2.plot([], [], [], color=c, lw=2)[0] for c in COLORS]
    dots_ax2 = [ax2.plot([], [], [], color=c, marker='o', markersize=8)[0]
                for c in COLORS]
    ax1.legend(loc='upper right')

    # Animation update function
    def update(frame):
        idx = min(frame * 2, STEPS - 1)  # Speed up 2x
        for j, path in enumerate(trajectories):
            i = min(idx, len(path) - 1)

            # Trajectory up to current frame
            xs = path[:i+1, 0]
            ys = path[:i+1, 1]
            zs = loss_fn(xs, ys)

            # Current position
            cx, cy = [path[i, 0]], [path[i, 1]]
            cz = [loss_fn(path[i, 0], path[i, 1])]

            # Update both views
            for lines, dots in [(lines_ax1, dots_ax1), (lines_ax2, dots_ax2)]:
                lines[j].set_data(xs, ys)
                lines[j].set_3d_properties(zs)
                dots[j].set_data(cx, cy)
                dots[j].set_3d_properties(cz)
        return lines_ax1 + dots_ax1 + lines_ax2 + dots_ax2

    # Create and save animation
    ani = animation.FuncAnimation(fig, update, frames=STEPS//2,
                                  interval=30, blit=False)
    ani.save(filename, writer='pillow', fps=30)
    print(f"Saved to {filename}")
    plt.close()
# ============================================================================
# VISUALIZATION 3: AMPLIFICATION FACTOR ANALYSIS
# ============================================================================

def plot_amplification_analysis(results, filename='epsilon_amplification.png'):
    """
    Plot how epsilon affects the amplification factor (step size / gradient).
    Shows when optimizers enter the 'epsilon trap' regime.
    """
    print("Generating amplification analysis...")

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 8))
    jet_colors = cm.jet(np.linspace(0, 1, len(EPS_LIST)))

    for i, eps in enumerate(EPS_LIST):
        color = jet_colors[i]
        data = results[eps]

        # Filter valid data
        mask = data['grads'] > 1e-12
        grads = data['grads'][mask]

        # Amplification = (step_size / learning_rate) / gradient
        gains = (data['updates'][mask] / LEARNING_RATE) / grads

        # Plot raw scatter (faint)
        ax.scatter(grads, gains, color=color, s=15, alpha=0.12, edgecolors='none')

        # Calculate binned median trend
        df = pd.DataFrame({'grad': grads, 'gain': gains})
        bins = np.logspace(np.log10(grads.min()), np.log10(grads.max()), 30)
        df['bin'] = pd.cut(df['grad'], bins, include_lowest=True)
        grouped = df.groupby('bin', observed=True)['gain']
        trend = grouped.median()
        lower = grouped.quantile(0.25)
        upper = grouped.quantile(0.75)
        bin_centers = [interval.mid for interval in trend.index]

        # Plot median trend
        ax.plot(bin_centers, trend.values, color=color, linewidth=4,
                label=rf'$\epsilon = 10^{{{int(np.log10(eps))}}}$')

        # Shade IQR region
        ax.fill_between(bin_centers, lower.values, upper.values,
                        color=color, alpha=0.20, edgecolor='none')

        # Theoretical ceiling (1/epsilon)
        ceiling = 1 / eps
        ax.axhline(ceiling, color=color, linestyle='--', linewidth=1.8,
                   alpha=0.65, label=f'Ceiling: 1/ε = {ceiling:.0e}')

    # Formatting
    ax.set_xlabel(r'Gradient Magnitude (log scale)', fontsize=14)
    ax.set_ylabel(r'Amplification Factor (log scale)', fontsize=14)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(1e-10, 1e-7)
    ax.set_ylim(1e4, 1e11)
    ax.legend(fontsize=12, loc='upper right', frameon=True, shadow=True)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"Saved to {filename}")
    plt.close()
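
# Where the 1/eps ceiling comes from (a short derivation under the simplifying
# assumption of a near-constant gradient g, a good approximation for the
# slowly-moving X coordinate): with m_hat ~ g and v_hat ~ g**2, the per-step
# amplification measured above is
#     (|update| / lr) / |g|  ~  (|g| / (|g| + eps)) / |g|  =  1 / (|g| + eps),
# which tends to 1/eps as |g| -> 0. The dashed lines in the plot mark exactly
# that limit for each epsilon.
def theoretical_amplification(grad_magnitude, eps):
    """Predicted amplification for a steady gradient of the given magnitude."""
    return 1.0 / (grad_magnitude + eps)
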
# ============================================================================
# UTILITY: CHECK FLOATING POINT PRECISION
# ============================================================================

def inspect_dtype(dtype, name):
    """Print floating point characteristics for a given dtype."""
    print(f"\n--- {name} ---")
    info = torch.finfo(dtype)
    print(f"Machine Epsilon (eps): {info.eps}")
    print(f"Smallest Normal (tiny): {info.tiny}")
    print(f"Max Value: {info.max}")

    # Find smallest subnormal empirically by halving until underflow
    val = torch.tensor(info.tiny, dtype=dtype)
    smallest_subnormal = val
    while val > 0:
        smallest_subnormal = val
        val = val / 2
    print(f"True Smallest (Subnormal): {smallest_subnormal.item()}")
    print("-" * 50)


def check_precision_limits():
    """Check BF16, FP16, and FP32 precision characteristics."""
    print("\nFloating Point Precision Analysis")
    print("=" * 50)
    try:
        inspect_dtype(torch.bfloat16, "BFLOAT16")
    except TypeError:
        print("BFLOAT16 not supported on this device/version.")
    inspect_dtype(torch.float16, "FP16 (Half)")
    inspect_dtype(torch.float32, "FP32 (Float)")
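
# A quick companion check (an aside, not part of the original pipeline): how
# the tested epsilon values are actually stored in half precision. float16's
# smallest subnormal is about 6e-8, so eps = 1e-8 or 1e-10 flushes to zero if
# optimizer state is kept in fp16, removing the safety floor entirely;
# bfloat16 keeps fp32's exponent range, so these values survive, but with far
# coarser precision.
def show_eps_storage():
    """Print how each tested epsilon round-trips through fp16 and bf16."""
    for eps in EPS_LIST:
        fp16_val = torch.tensor(eps, dtype=torch.float16).item()
        bf16_val = torch.tensor(eps, dtype=torch.bfloat16).item()
        print(f"eps={eps:.0e}: fp16 -> {fp16_val}, bf16 -> {bf16_val}")
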
# ============================================================================
# MAIN EXPERIMENT PIPELINE
# ============================================================================

def run_full_experiment():
    """Execute complete experimental pipeline."""
    print("=" * 70)
    print("THE EPSILON TRAP EXPERIMENT")
    print("=" * 70)

    # Step 1: Check precision limits
    check_precision_limits()

    # Step 2: Generate trajectories
    print("\n" + "=" * 70)
    print("Running optimizers...")
    print("=" * 70)

    trajectories = []
    results = {}

    for eps, color in zip(EPS_LIST, COLORS):
        print(f"\nRunning eps={eps}...")
        path, grads, updates = run_optimizer(eps, track_metrics=True)
        trajectories.append(path)
        results[eps] = {
            'path': path,
            'grads': grads,
            'updates': updates,
            'color': color
        }
        print(f"  Final position: {path[-1]}")
        print(f"  Steps taken: {len(path)}")

    # Step 3: Create visualizations
    print("\n" + "=" * 70)
    print("Generating visualizations...")
    print("=" * 70)

    create_loss_landscape_gif()
    create_trajectory_race_gif(trajectories)
    plot_amplification_analysis(results)

    print("\n" + "=" * 70)
    print("EXPERIMENT COMPLETE!")
    print("=" * 70)
    print("\nGenerated files:")
    print("  - elongated_canyon.gif (rotating loss landscape)")
    print("  - adam_3d_eps_race.gif (trajectory comparison)")
    print("  - epsilon_amplification.png (amplification analysis)")

    return results


# ============================================================================
# ENTRY POINT
# ============================================================================

if __name__ == "__main__":
    results = run_full_experiment()
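
# Note on runtime (an aside, not from the original gist): the GIF rendering
# dominates the wall-clock time. To reproduce only the quantitative result,
# the animation calls can be skipped and just the amplification plot produced:
#
#     results = {}
#     for eps, color in zip(EPS_LIST, COLORS):
#         path, grads, updates = run_optimizer(eps, track_metrics=True)
#         results[eps] = {'path': path, 'grads': grads,
#                         'updates': updates, 'color': color}
#     plot_amplification_analysis(results)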