Created
March 6, 2026 06:40
-
-
Save albertbuchard/335f297fb6382a0777c948b8506c6f67 to your computer and use it in GitHub Desktop.
Bizzaro Karpathy Experiment Runner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import argparse | |
| import datetime as dt | |
| import json | |
| import shlex | |
| import subprocess | |
| import sys | |
| import time | |
| import uuid | |
| from dataclasses import asdict, dataclass | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional, Tuple | |
| from rich.columns import Columns | |
| from rich.console import Console, Group | |
| from rich.live import Live | |
| from rich.panel import Panel | |
| from rich.text import Text | |
# Shared rich console for the whole script; truecolor preserves the dashboard styling.
console = Console(color_system="truecolor")
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with a trailing "Z".

    Second precision only (microseconds are dropped), e.g. "2026-03-06T06:40:00Z".
    """
    # datetime.utcnow() is deprecated and returns a naive datetime; build an
    # aware UTC timestamp and normalize the "+00:00" offset to the usual "Z".
    now = dt.datetime.now(dt.timezone.utc).replace(microsecond=0)
    return now.isoformat().replace("+00:00", "Z")
def slugify(s: str) -> str:
    """Lower-case *s* and reduce it to a dash-separated slug.

    Alphanumerics are kept, common separators become dashes, everything else
    is dropped; runs of dashes collapse and the result never starts/ends with
    one. Falls back to "experiment" for inputs with no usable characters.
    """
    separators = {" ", "-", "_", "/", "."}
    pieces = [
        c if c.isalnum() else "-"
        for c in s.lower()
        if c.isalnum() or c in separators
    ]
    slug = "".join(pieces)
    while "--" in slug:
        slug = slug.replace("--", "-")
    return slug.strip("-") or "experiment"
def run(
    cmd: List[str],
    cwd: Optional[Path] = None,
    check: bool = True,
    capture: bool = True,
) -> subprocess.CompletedProcess:
    """Run *cmd* without a shell and return the completed process.

    Output is captured as text when *capture* is true; *check* raises
    CalledProcessError on a non-zero exit, mirroring subprocess.run.
    """
    working_dir = None if cwd is None else str(cwd)
    return subprocess.run(
        cmd,
        cwd=working_dir,
        check=check,
        text=True,
        capture_output=capture,
    )
def run_shell(command: str, cwd: Optional[Path] = None, check: bool = True) -> subprocess.CompletedProcess:
    """Execute *command* through /bin/bash, capturing stdout/stderr as text."""
    options: Dict[str, Any] = dict(
        shell=True,
        check=check,
        text=True,
        capture_output=True,
        executable="/bin/bash",
    )
    if cwd is not None:
        options["cwd"] = str(cwd)
    return subprocess.run(command, **options)
def git(repo: Path, *args: str, check: bool = True) -> subprocess.CompletedProcess:
    """Invoke a git subcommand inside *repo* via the shared run() helper."""
    command = ["git"]
    command.extend(args)
    return run(command, cwd=repo, check=check)
def current_branch(repo: Path) -> str:
    """Return the name of the branch currently checked out in *repo*."""
    proc = git(repo, "rev-parse", "--abbrev-ref", "HEAD")
    return proc.stdout.strip()
def ensure_clean_worktree(repo: Path) -> None:
    """Raise RuntimeError if *repo* has any uncommitted or untracked changes."""
    porcelain = git(repo, "status", "--porcelain").stdout
    if porcelain.strip():
        raise RuntimeError("Working tree is not clean. Commit or stash changes before running.")
def file_exists(repo: Path, relpath: str) -> bool:
    """Return True when *relpath* exists under the repository root *repo*."""
    candidate = repo / relpath
    return candidate.exists()
def safe_json_loads(text: str) -> Optional[Any]:
    """Parse *text* as JSON, returning None instead of raising on bad input.

    Catches only the errors json.loads actually raises — ValueError (which
    covers json.JSONDecodeError) and TypeError for non-string input — so
    unrelated failures are no longer silently swallowed by a broad except.
    """
    try:
        return json.loads(text)
    except (ValueError, TypeError):
        return None
def extract_json_objects_from_jsonl(text: str) -> List[dict]:
    """Collect every JSON object that appears on its own line of *text*.

    Blank lines, unparsable lines, and non-object JSON values are skipped.
    """
    found: List[dict] = []
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        parsed = safe_json_loads(candidate)
        if isinstance(parsed, dict):
            found.append(parsed)
    return found
@dataclass
class ExperimentRecord:
    """One row of the experiment log: a single attempted change and its outcome."""

    date: str  # UTC timestamp (utc_now()) when the record was written
    iteration: int  # 0 for the bootstrap run, 1..max_iters afterwards
    branch: str  # git branch the change was developed on
    name_of_change: str  # short human-readable label for the change
    exact_change: str  # detailed description (truncated codex summary)
    hypothesis: str  # why the change was expected to help
    tests_passed: bool  # whether test_cmd exited 0
    experiment_ran: bool  # whether the runner produced a valid metric payload
    metric_name: Optional[str]  # e.g. "val_bpb"; None until bootstrap discovers it
    metric_before: Optional[float]  # best metric before this change
    metric_after: Optional[float]  # metric produced by this change
    memory_gb: Optional[float]  # optional resource figure reported by the runner
    time_min: Optional[float]  # optional runtime reported by the runner
    improved: bool  # True when metric_after < metric_before (lower is better)
    passed_or_discarded: str  # "running" | "merged" | "discarded" | "failed"
    codex_stdout: str  # raw codex JSONL output (plus any repair attempt)
    codex_stderr: str  # raw codex stderr (plus any repair attempt)
    test_logs: str  # captured test command output
    experiment_logs: str  # captured experiment runner output
    merge_commit: Optional[str]  # SHA of the merge commit when merged, else None
class JsonStateLog:
    """JSON state file tracking the whole experimenter session.

    The entire state dict is rewritten on every mutation so that the file on
    disk is always a complete, valid snapshot that can be tailed by other
    tools while the run is in progress.
    """

    def __init__(self, path: Path, experiment_name: str, root_branch: str):
        self.path = path
        self.data: Dict[str, Any] = {
            "experiment_name": experiment_name,
            # Log file names look like "<slug>_<run_id>_logs"; recover the run
            # id from the stem, falling back to a fresh UUID otherwise.
            "uuid": path.stem.split("_")[-2] if "_" in path.stem else str(uuid.uuid4()),
            "created_at": utc_now(),
            "root_branch": root_branch,
            "status": {
                "phase": "initialized",
                "current_iteration": 0,
                "current_branch": root_branch,
                "best_metric": None,
                "best_metric_name": None,
                "bootstrap_done": False,
                "last_error": None,
                "finished": False,
                "current_step_message": "starting",
            },
            "experiments": [],
        }
        self.flush()

    def set_status(self, **kwargs: Any) -> None:
        """Merge *kwargs* into the status dict and persist."""
        self.data["status"].update(kwargs)
        self.flush()

    def add_experiment(self, rec: ExperimentRecord) -> None:
        """Append a new experiment record and persist."""
        self.data["experiments"].append(asdict(rec))
        self.flush()

    def replace_last_experiment(self, rec: ExperimentRecord) -> None:
        """Overwrite the most recent record (or append if none exist) and persist."""
        if self.data["experiments"]:
            self.data["experiments"][-1] = asdict(rec)
        else:
            self.data["experiments"].append(asdict(rec))
        self.flush()

    def flush(self) -> None:
        """Atomically rewrite the log file.

        Writing to a sibling temp file and renaming it over the target
        prevents a crash mid-write from leaving a truncated, unreadable
        JSON log (the previous direct write_text was not crash-safe).
        """
        tmp = self.path.with_name(self.path.name + ".tmp")
        tmp.write_text(json.dumps(self.data, indent=2), encoding="utf-8")
        tmp.replace(self.path)
class ExperimentUI:
    """Builds the rich renderable tree (history, summary, status) for Live.

    All methods are read-only: they take the JsonStateLog data dict and
    return rich objects; nothing here mutates experiment state.
    """

    def __init__(self, title: str):
        self.title = title  # shown as the history panel's title

    def _status_style(self, status: str) -> str:
        """Map a disposition string to the rich style used to color it."""
        s = (status or "").lower()
        if s == "merged":
            return "bold green"
        if s == "discarded":
            return "bold yellow"
        if s == "failed":
            return "bold red"
        if s == "running":
            return "bold cyan"
        return "white"

    def _status_label(self, status: str) -> str:
        """Map a disposition string to its display label (merged shows as PASSED)."""
        s = (status or "").lower()
        if s == "merged":
            return "PASSED"
        if s == "discarded":
            return "DISCARDED"
        if s == "failed":
            return "FAILED"
        if s == "running":
            return "RUNNING"
        # BUGFIX: upper-case the normalized `s`, not the raw argument, so a
        # None status cannot crash on .upper(); empty input yields UNKNOWN.
        return s.upper() or "UNKNOWN"

    def _short_commitish(self, exp: dict) -> str:
        """Best-effort 7-char commit identifier for an experiment row.

        Prefers the recorded merge commit, then scans codex stdout for a
        hex-looking token on any line mentioning "commit", then falls back
        to the branch name's trailing uuid fragment (or a dash placeholder).
        """
        merge_commit = exp.get("merge_commit")
        if isinstance(merge_commit, str) and merge_commit.strip():
            return merge_commit[:7]
        codex_stdout = exp.get("codex_stdout", "")
        for line in codex_stdout.splitlines():
            if "commit" in line.lower():
                parts = line.strip().split()
                for part in parts:
                    if len(part) >= 7 and all(c in "0123456789abcdef" for c in part[:7].lower()):
                        return part[:7]
        branch = exp.get("branch", "")
        tail = branch.split("-")[-1] if branch else ""
        return (tail or "-------")[:7]

    def _format_metric_triplet(self, exp: dict) -> Text:
        """Render one experiment as `commit,mem,time,metric,desc — STATUS`."""
        commit = self._short_commitish(exp)
        memory_gb = exp.get("memory_gb")
        time_min = exp.get("time_min")
        metric_after = exp.get("metric_after")
        mem_s = f"{memory_gb:.1f}" if isinstance(memory_gb, (int, float)) else "-"
        time_s = f"{time_min:.2f}" if isinstance(time_min, (int, float)) else "-"
        metric_s = f"{metric_after:.6f}" if isinstance(metric_after, (int, float)) else "-"
        desc = exp.get("name_of_change", "") or exp.get("hypothesis", "") or "-"
        desc = desc.replace("\n", " ").strip()
        if len(desc) > 72:
            desc = desc[:69] + "..."
        status = self._status_label(exp.get("passed_or_discarded", ""))
        status_style = self._status_style(exp.get("passed_or_discarded", ""))
        line = Text()
        line.append(f"{commit},", style="white")
        line.append(f"{mem_s},", style="bright_white")
        line.append(f"{time_s},", style="bright_white")
        line.append(f"{metric_s},", style="bold white")
        line.append(desc, style="white")
        line.append(" — ", style="dim")
        line.append(status, style=status_style)
        return line

    def _format_row(self, idx: int, exp: dict) -> Text:
        """Prefix a metric-triplet line with its 1-based row index."""
        t = Text()
        t.append(f"{idx:>3} ", style="dim")
        t.append_text(self._format_metric_triplet(exp))
        return t

    def _build_history_columns(self, data: dict) -> Columns:
        """Lay out all experiment rows across two equal columns."""
        experiments = data.get("experiments", [])
        left = Text()
        right = Text()
        header = Text(" commit,memory_gb,time_min,val_bpb,description\n", style="dim")
        left.append_text(header)
        right.append_text(header)
        if not experiments:
            left.append(" no experiments yet\n", style="dim")
            return Columns([left, right], expand=True, equal=True)
        rows = [self._format_row(i + 1, exp) for i, exp in enumerate(experiments)]
        # First half (rounded up) goes left, remainder right.
        split = (len(rows) + 1) // 2
        left_rows = rows[:split]
        right_rows = rows[split:]
        for r in left_rows:
            left.append_text(r)
            left.append("\n")
        for r in right_rows:
            right.append_text(r)
            right.append("\n")
        return Columns([left, right], expand=True, equal=True)

    def _summary_line(self, data: dict) -> Text:
        """Render disposition counts plus the best metric seen so far."""
        exps = data.get("experiments", [])
        merged = sum(1 for e in exps if e.get("passed_or_discarded") == "merged")
        discarded = sum(1 for e in exps if e.get("passed_or_discarded") == "discarded")
        failed = sum(1 for e in exps if e.get("passed_or_discarded") == "failed")
        running = sum(1 for e in exps if e.get("passed_or_discarded") == "running")
        status = data.get("status", {})
        best_metric = status.get("best_metric")
        best_metric_name = status.get("best_metric_name") or "metric"
        txt = Text()
        txt.append("passed ", style="green")
        txt.append(str(merged), style="bold green")
        txt.append(" discarded ", style="yellow")
        txt.append(str(discarded), style="bold yellow")
        txt.append(" failed ", style="red")
        txt.append(str(failed), style="bold red")
        txt.append(" running ", style="cyan")
        txt.append(str(running), style="bold cyan")
        txt.append(" best ", style="white")
        if isinstance(best_metric, (int, float)):
            txt.append(f"{best_metric_name}={best_metric:.6f}", style="bold white")
        else:
            txt.append("-", style="dim")
        return txt

    def _status_block(self, data: dict) -> Panel:
        """Render the current phase / branch / iteration panel."""
        status = data.get("status", {})
        root_branch = data.get("root_branch", "?")
        current_branch = status.get("current_branch", "?")
        current_iteration = status.get("current_iteration", 0)
        phase = status.get("phase", "?")
        msg = status.get("current_step_message", "-")
        last_error = status.get("last_error")
        body = Text()
        body.append("status\n", style="bold white")
        body.append(f"root_branch: {root_branch}\n", style="white")
        body.append(f"current_branch: {current_branch}\n", style="white")
        body.append(f"iteration: {current_iteration}\n", style="white")
        body.append(f"phase: {phase}\n", style="white")
        body.append(f"now: {msg}\n", style="bold cyan")
        if last_error:
            body.append(f"last_error: {last_error}\n", style="bold red")
        return Panel(body, border_style="white", title="Current status")

    def render(self, data: dict):
        """Compose history, summary and status panels into one Group for Live."""
        history = self._build_history_columns(data)
        summary = self._summary_line(data)
        status = self._status_block(data)
        return Group(
            Panel(history, title=self.title, border_style="white"),
            Panel(summary, border_style="white"),
            status,
        )
class CodexAgent:
    """Thin wrapper around the `codex` CLI for headless prompt execution."""

    def __init__(self, repo: Path, model: Optional[str] = None):
        self.repo = repo  # repository the codex process runs inside
        self.model = model  # optional model override passed as --model

    def exec(self, prompt: str, full_auto: bool = True, extra_args: Optional[List[str]] = None) -> Tuple[str, str, int]:
        """Run `codex exec --json <prompt>` inside the repo.

        Returns (stdout, stderr, returncode); deliberately does not raise on
        a non-zero exit so callers can log failures themselves.
        """
        cmd = ["codex", "exec", "--json"]
        if full_auto:
            cmd.append("--full-auto")
        if self.model:
            cmd.extend(["--model", self.model])
        if extra_args:
            cmd.extend(extra_args)
        # The prompt is the final positional argument.
        cmd.append(prompt)
        proc = subprocess.run(
            cmd,
            cwd=str(self.repo),
            text=True,
            capture_output=True,
        )
        return proc.stdout, proc.stderr, proc.returncode

    @staticmethod
    def summarize_jsonl_output(stdout: str) -> str:
        """Extract human-readable text from codex's JSONL stdout.

        Pulls common text-bearing keys from each JSON object on its own line;
        falls back to the raw stdout when nothing usable was found.
        """
        objs = extract_json_objects_from_jsonl(stdout)
        parts: List[str] = []
        for obj in objs:
            for key in ("message", "content", "text", "summary"):
                val = obj.get(key)
                if isinstance(val, str) and val.strip():
                    parts.append(val.strip())
        return "\n".join(parts).strip() or stdout.strip()
class Experimenter:
    """Autonomous optimize-loop orchestrator.

    One bootstrap pass has codex create a headless single-run experiment
    harness; each later iteration asks codex for one change on a fresh git
    branch, validates it (test command plus one experiment run), and merges
    the branch back into the root branch only when the metric strictly
    improves (lower is better). All progress is mirrored into a JsonStateLog
    file and a rich Live dashboard.
    """

    def __init__(
        self,
        repo: Path,
        objective: str,
        artifact_path: str,
        test_cmd: str,
        experiment_runner_path: str,
        max_iters: int,
        experiment_name: str,
        model: Optional[str] = None,
    ):
        self.repo = repo
        self.objective = objective
        self.artifact_path = artifact_path
        self.test_cmd = test_cmd  # shell command that must pass before a merge
        self.experiment_runner_path = experiment_runner_path
        self.max_iters = max_iters
        self.root_branch = current_branch(repo)  # branch experiment work merges into
        self.agent = CodexAgent(repo, model=model)
        self.ui = ExperimentUI(f"Experimenter · {experiment_name}")
        run_id = str(uuid.uuid4())[:8]
        # Log file lives inside the target repo; JsonStateLog recovers run_id
        # from this "<slug>_<run_id>_logs" stem.
        log_name = f"{slugify(experiment_name)}_{run_id}_logs.json"
        self.log = JsonStateLog(repo / log_name, experiment_name, self.root_branch)
        self.best_metric: Optional[float] = None  # lowest metric seen so far
        self.metric_name: Optional[str] = None  # learned from the bootstrap run
        self.live: Optional[Live] = None  # set only while the Live context is open

    def refresh_ui(self) -> None:
        """Re-render the dashboard if the Live display is active."""
        if self.live is not None:
            self.live.update(self.ui.render(self.log.data), refresh=True)

    def write_status(self, phase: str, current_step_message: Optional[str] = None, **extra: Any) -> None:
        """Persist the current phase/branch/metric snapshot and refresh the UI."""
        payload = dict(
            phase=phase,
            current_branch=current_branch(self.repo),
            best_metric=self.best_metric,
            best_metric_name=self.metric_name,
            **extra,
        )
        if current_step_message is not None:
            payload["current_step_message"] = current_step_message
        self.log.set_status(**payload)
        self.refresh_ui()

    def checkout_root(self) -> None:
        """Switch the working tree back to the root branch."""
        git(self.repo, "checkout", self.root_branch)

    def create_branch(self, name: str) -> None:
        """Create and check out *name*; -B resets it if it already exists."""
        git(self.repo, "checkout", "-B", name)

    def delete_branch(self, name: str) -> None:
        """Return to root and force-delete *name* (best effort — no raise)."""
        self.checkout_root()
        git(self.repo, "branch", "-D", name, check=False)

    def merge_branch(self, name: str) -> Optional[str]:
        """Merge *name* into root (--no-ff keeps a merge commit), then delete it.

        Returns the SHA of the resulting merge commit.
        """
        self.checkout_root()
        git(self.repo, "merge", "--no-ff", name, "-m", f"Merge {name} from experimenter")
        merge_sha = git(self.repo, "rev-parse", "HEAD").stdout.strip()
        git(self.repo, "branch", "-D", name, check=False)
        return merge_sha

    def run_tests(self) -> Tuple[bool, str]:
        """Run the configured test command; return (passed, combined logs)."""
        proc = run_shell(self.test_cmd, cwd=self.repo, check=False)
        ok = proc.returncode == 0
        logs = f"$ {self.test_cmd}\n\nSTDOUT:\n{proc.stdout}\n\nSTDERR:\n{proc.stderr}"
        return ok, logs

    def run_experiment(
        self,
    ) -> Tuple[bool, Optional[str], Optional[float], Optional[float], Optional[float], str]:
        """Execute the experiment runner once and parse its JSON stdout.

        Returns (ok, metric_name, metric_value, memory_gb, time_min, logs).
        ok is False when the process fails, stdout is not a JSON object, or
        the required metric fields are missing/non-numeric. memory_gb and
        time_min are optional and silently fall back to None when invalid.
        NOTE(review): the runner's ENTIRE stdout must be the JSON payload —
        any extra prints would break parsing here.
        """
        cmd = f"python {shlex.quote(self.experiment_runner_path)}"
        proc = run_shell(cmd, cwd=self.repo, check=False)
        logs = f"$ {cmd}\n\nSTDOUT:\n{proc.stdout}\n\nSTDERR:\n{proc.stderr}"
        if proc.returncode != 0:
            return False, None, None, None, None, logs
        parsed = safe_json_loads(proc.stdout.strip())
        if not isinstance(parsed, dict):
            return False, None, None, None, None, logs + "\n\nRunner output was not valid JSON."
        metric_name = parsed.get("metric_name")
        metric_value = parsed.get("metric_value")
        memory_gb = parsed.get("memory_gb")
        time_min = parsed.get("time_min")
        if not isinstance(metric_name, str):
            return False, None, None, None, None, logs + "\n\nMissing metric_name."
        try:
            metric_value = float(metric_value)
        except Exception:
            return False, None, None, None, None, logs + "\n\nmetric_value was not numeric."
        try:
            memory_gb = float(memory_gb) if memory_gb is not None else None
        except Exception:
            memory_gb = None
        try:
            time_min = float(time_min) if time_min is not None else None
        except Exception:
            time_min = None
        return True, metric_name, metric_value, memory_gb, time_min, logs

    def add_running_placeholder(self, iteration: int, branch: str, name: str) -> None:
        """Append a provisional "running" record; it is replaced in place later."""
        self.log.add_experiment(
            ExperimentRecord(
                date=utc_now(),
                iteration=iteration,
                branch=branch,
                name_of_change=name,
                exact_change="",
                hypothesis="",
                tests_passed=False,
                experiment_ran=False,
                metric_name=self.metric_name,
                metric_before=self.best_metric,
                metric_after=None,
                memory_gb=None,
                time_min=None,
                improved=False,
                passed_or_discarded="running",
                codex_stdout="",
                codex_stderr="",
                test_logs="",
                experiment_logs="",
                merge_commit=None,
            )
        )
        self.refresh_ui()

    def bootstrap_runner(self) -> None:
        """Iteration 0: have codex create the experiment runner, validate, merge.

        On success this seeds best_metric / metric_name for all later
        iterations; on any failure the branch is discarded and RuntimeError
        is raised, since the loop cannot proceed without a harness.
        """
        self.write_status("bootstrap_runner", "bootstrapping experiment runner")
        branch = f"exp/bootstrap-runner-{str(uuid.uuid4())[:8]}"
        self.create_branch(branch)
        self.add_running_placeholder(0, branch, "bootstrap experiment runner")
        prompt = f"""
You are building the bootstrap experiment runner for an autonomous experiment loop.
Repository root: {self.repo}
Target artifact to optimize: {self.artifact_path}
Objective: {self.objective}
Create exactly one headless runner script at:
{self.experiment_runner_path}
Requirements:
1. The script must run one experiment end-to-end for the current repository state.
2. It must print ONLY valid JSON to stdout in the form:
{{"metric_name": "<name>", "metric_value": <float>, "memory_gb": <float|null>, "time_min": <float|null>, "extra": {{...}}}}
3. The metric must be the optimized metric implied by:
"{self.objective}"
4. Add any minimal helper code/config needed.
5. If there are tests, update or add a small smoke test if appropriate.
6. Commit your work locally on this branch with a clear git commit message.
7. Do not ask questions. Make reasonable assumptions and implement the smallest robust version.
"""
        stdout, stderr, rc = self.agent.exec(prompt)
        if rc != 0:
            # Codex itself failed: discard the branch and record the failure.
            self.delete_branch(branch)
            self.log.replace_last_experiment(
                ExperimentRecord(
                    date=utc_now(),
                    iteration=0,
                    branch=branch,
                    name_of_change="bootstrap experiment runner",
                    exact_change="codex bootstrap failed",
                    hypothesis="Need a reproducible single-run experiment harness.",
                    tests_passed=False,
                    experiment_ran=False,
                    metric_name=None,
                    metric_before=None,
                    metric_after=None,
                    memory_gb=None,
                    time_min=None,
                    improved=False,
                    passed_or_discarded="failed",
                    codex_stdout=stdout,
                    codex_stderr=stderr,
                    test_logs="",
                    experiment_logs="",
                    merge_commit=None,
                )
            )
            self.refresh_ui()
            raise RuntimeError("Codex bootstrap failed.")
        if not file_exists(self.repo, self.experiment_runner_path):
            self.delete_branch(branch)
            raise RuntimeError(f"Bootstrap did not create {self.experiment_runner_path}")
        self.write_status("bootstrap_runner", "validating bootstrap tests")
        tests_ok, test_logs = self.run_tests()
        self.write_status("bootstrap_runner", "running bootstrap experiment")
        exp_ok, metric_name, metric_value, memory_gb, time_min, exp_logs = self.run_experiment()
        if not tests_ok or not exp_ok:
            # Harness was created but does not validate — record and abort.
            self.delete_branch(branch)
            self.log.replace_last_experiment(
                ExperimentRecord(
                    date=utc_now(),
                    iteration=0,
                    branch=branch,
                    name_of_change="bootstrap experiment runner",
                    exact_change=f"Created {self.experiment_runner_path} but validation failed.",
                    hypothesis="Need a reproducible single-run experiment harness.",
                    tests_passed=tests_ok,
                    experiment_ran=exp_ok,
                    metric_name=metric_name,
                    metric_before=None,
                    metric_after=metric_value,
                    memory_gb=memory_gb,
                    time_min=time_min,
                    improved=False,
                    passed_or_discarded="failed",
                    codex_stdout=stdout,
                    codex_stderr=stderr,
                    test_logs=test_logs,
                    experiment_logs=exp_logs,
                    merge_commit=None,
                )
            )
            self.refresh_ui()
            raise RuntimeError("Bootstrap runner branch failed validation.")
        self.write_status("bootstrap_runner", "merging bootstrap branch")
        merge_sha = self.merge_branch(branch)
        # The bootstrap result becomes the baseline all iterations compare to.
        self.best_metric = metric_value
        self.metric_name = metric_name
        self.log.replace_last_experiment(
            ExperimentRecord(
                date=utc_now(),
                iteration=0,
                branch=branch,
                name_of_change="bootstrap experiment runner",
                exact_change=f"Created {self.experiment_runner_path} and minimal support code.",
                hypothesis="Need a reproducible single-run experiment harness before iterative optimization.",
                tests_passed=tests_ok,
                experiment_ran=exp_ok,
                metric_name=metric_name,
                metric_before=None,
                metric_after=metric_value,
                memory_gb=memory_gb,
                time_min=time_min,
                improved=True,
                passed_or_discarded="merged",
                codex_stdout=stdout,
                codex_stderr=stderr,
                test_logs=test_logs,
                experiment_logs=exp_logs,
                merge_commit=merge_sha,
            )
        )
        self.refresh_ui()
        self.write_status("bootstrap_complete", "bootstrap finished", bootstrap_done=True)

    def run_iteration(self, i: int) -> None:
        """Run one hypothesis → implement → validate → merge-or-discard cycle."""
        self.write_status("iteration_start", f"starting iteration {i}", current_iteration=i)
        branch = f"exp/{slugify(self.metric_name or 'metric')}-iter-{i}-{str(uuid.uuid4())[:8]}"
        self.create_branch(branch)
        before_metric = self.best_metric
        self.add_running_placeholder(i, branch, f"iteration {i} hypothesis search")
        prompt = f"""
You are one iteration of an autonomous ML/code experimenter.
Root branch: {self.root_branch}
Current branch: {branch}
Optimization objective: {self.objective}
Target artifact: {self.artifact_path}
Experiment runner: {self.experiment_runner_path}
Current best metric ({self.metric_name}): {self.best_metric}
Your job:
1. Form one concrete hypothesis for improving the metric.
2. Implement the smallest useful change.
3. Add or update tests or validation as needed.
4. Commit your changes on this branch with a clear commit message.
Hard requirements:
- Work headlessly.
- Do not ask questions.
- Favor small, reversible changes.
- Ensure the codebase remains runnable.
- Do not merge branches yourself.
- Assume lower metric is better.
- Return enough detail in your final response to describe:
- hypothesis
- exact files changed
- exact nature of the change
Important:
- The outer orchestrator will run tests and the experiment runner after you finish.
- If you notice likely runtime issues, proactively fix them before finishing.
"""
        self.write_status("iteration_coding", f"codex implementing iteration {i}", current_iteration=i)
        stdout, stderr, rc = self.agent.exec(prompt)
        codex_summary = self.agent.summarize_jsonl_output(stdout)
        if rc != 0:
            # Codex failed outright; nothing to validate.
            self.delete_branch(branch)
            self.log.replace_last_experiment(
                ExperimentRecord(
                    date=utc_now(),
                    iteration=i,
                    branch=branch,
                    name_of_change=f"iteration {i} failed before validation",
                    exact_change="",
                    hypothesis="",
                    tests_passed=False,
                    experiment_ran=False,
                    metric_name=self.metric_name,
                    metric_before=before_metric,
                    metric_after=None,
                    memory_gb=None,
                    time_min=None,
                    improved=False,
                    passed_or_discarded="failed",
                    codex_stdout=stdout,
                    codex_stderr=stderr,
                    test_logs="",
                    experiment_logs="",
                    merge_commit=None,
                )
            )
            self.refresh_ui()
            return
        self.write_status("iteration_validation", f"running tests for iteration {i}", current_iteration=i)
        tests_ok, test_logs = self.run_tests()
        self.write_status("iteration_validation", f"running experiment for iteration {i}", current_iteration=i)
        exp_ok, metric_name, metric_after, memory_gb, time_min, exp_logs = self.run_experiment()
        if not tests_ok or not exp_ok:
            # Exactly one repair round: tell codex it failed, then re-validate.
            repair_prompt = f"""
The previous implementation on this branch did not validate.
Please repair the branch until:
1. tests pass using:
{self.test_cmd}
2. the experiment runner succeeds:
python {self.experiment_runner_path}
Do not change the overall goal.
Commit any repair changes with a clear commit message.
"""
            self.write_status("iteration_repair", f"repairing iteration {i}", current_iteration=i)
            r_stdout, r_stderr, _ = self.agent.exec(repair_prompt)
            stdout += "\n\n--- REPAIR ATTEMPT ---\n\n" + r_stdout
            stderr += "\n\n--- REPAIR ATTEMPT ---\n\n" + r_stderr
            self.write_status("iteration_validation", f"re-running tests for iteration {i}", current_iteration=i)
            tests_ok, test_logs = self.run_tests()
            self.write_status("iteration_validation", f"re-running experiment for iteration {i}", current_iteration=i)
            exp_ok, metric_name, metric_after, memory_gb, time_min, exp_logs = self.run_experiment()
        # Lower is better; a merge requires validation AND a strict improvement.
        improved = bool(
            tests_ok
            and exp_ok
            and metric_after is not None
            and before_metric is not None
            and metric_after < before_metric
        )
        # NOTE(review): codex_summary is not recomputed after a repair attempt,
        # so the logged summary describes only the first codex response.
        exact_change = codex_summary[:8000]
        hypothesis = codex_summary.splitlines()[0] if codex_summary else f"iteration {i}"
        merge_sha = None
        disposition = "discarded"
        if improved:
            self.write_status("iteration_merge", f"merging successful iteration {i}", current_iteration=i)
            merge_sha = self.merge_branch(branch)
            self.best_metric = metric_after
            self.metric_name = metric_name
            disposition = "merged"
        else:
            self.write_status("iteration_cleanup", f"discarding iteration {i}", current_iteration=i)
            self.delete_branch(branch)
        self.log.replace_last_experiment(
            ExperimentRecord(
                date=utc_now(),
                iteration=i,
                branch=branch,
                name_of_change=hypothesis[:160],
                exact_change=exact_change,
                hypothesis=hypothesis,
                tests_passed=tests_ok,
                experiment_ran=exp_ok,
                metric_name=metric_name or self.metric_name,
                metric_before=before_metric,
                metric_after=metric_after,
                memory_gb=memory_gb,
                time_min=time_min,
                improved=improved,
                passed_or_discarded=disposition,
                codex_stdout=stdout,
                codex_stderr=stderr,
                test_logs=test_logs,
                experiment_logs=exp_logs,
                merge_commit=merge_sha,
            )
        )
        self.refresh_ui()
        self.write_status(
            "iteration_complete",
            f"iteration {i} complete: {disposition}",
            current_iteration=i,
            last_error=None if (tests_ok and exp_ok) else "validation_failed_or_no_improvement",
        )

    def run(self) -> None:
        """Top-level entry: verify repo state, then bootstrap and iterate under a Live UI.

        Raises RuntimeError early if the worktree is dirty; any fatal error is
        recorded into the log before being re-raised.
        """
        ensure_clean_worktree(self.repo)
        self.checkout_root()
        with Live(
            self.ui.render(self.log.data),
            console=console,
            refresh_per_second=6,
            screen=True,
            auto_refresh=True,
        ) as live:
            self.live = live
            self.write_status("starting", "starting experimenter", bootstrap_done=False)
            try:
                self.bootstrap_runner()
                for i in range(1, self.max_iters + 1):
                    self.run_iteration(i)
                self.write_status("finished", "all iterations finished", finished=True)
                # Brief pause so the final frame is visible before Live exits.
                time.sleep(0.5)
            except Exception as e:
                self.write_status("failed", f"fatal error: {e}", last_error=str(e), finished=True)
                time.sleep(0.5)
                raise
def parse_args() -> argparse.Namespace:
    """Define and parse the experimenter's command-line interface."""
    parser = argparse.ArgumentParser(description="Headless Codex experimenter meta-agent")
    required_flags = [
        ("--repo", "Path to git repository"),
        ("--objective", "Meta objective"),
        ("--artifact-path", "Path to target artifact"),
        ("--test-cmd", "Shell command to validate tests"),
    ]
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument(
        "--experiment-runner-path",
        default="experiment_runner.py",
        help="Path Codex should create for the single-run experiment harness",
    )
    parser.add_argument("--max-iters", type=int, default=5, help="Number of experiment iterations")
    parser.add_argument("--experiment-name", default="meta_experiment", help="Name for the JSON log")
    parser.add_argument("--model", default=None, help="Optional Codex model override")
    return parser.parse_args()
def main() -> int:
    """CLI entry point: build an Experimenter from the args and run it to completion.

    Returns 0 on success; fatal errors propagate as exceptions from run().
    """
    args = parse_args()
    repo = Path(args.repo).resolve()
    exp = Experimenter(
        repo=repo,
        objective=args.objective,
        artifact_path=args.artifact_path,
        test_cmd=args.test_cmd,
        experiment_runner_path=args.experiment_runner_path,
        max_iters=args.max_iters,
        experiment_name=args.experiment_name,
        model=args.model,
    )
    exp.run()
    console.print(f"\n[bold green]Finished[/] · log written to [cyan]{exp.log.path.name}[/cyan]")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
python experimenter.py \
  --repo /path/to/repo \
  --objective "iteratively improve the model so that the loss function defined in /loss/weirdmse.py decreases" \
  --artifact-path "/loss/weirdmse.py" \
  --test-cmd "pytest -q" \
  --experiment-runner-path "tools/run_one_experiment.py" \
  --max-iters 20 \
  --experiment-name "weirdmse_optimization"