Created
March 6, 2026 06:40
-
-
Save albertbuchard/335f297fb6382a0777c948b8506c6f67 to your computer and use it in GitHub Desktop.
Bizzaro Karpathy Experiment Runner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import argparse | |
| import datetime as dt | |
| import json | |
| import shlex | |
| import subprocess | |
| import sys | |
| import time | |
| import uuid | |
| from dataclasses import asdict, dataclass | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional, Tuple | |
| from rich.columns import Columns | |
| from rich.console import Console, Group | |
| from rich.live import Live | |
| from rich.panel import Panel | |
| from rich.text import Text | |
# Shared rich console for the whole script; truecolor preserves the dashboard styling.
console = Console(color_system="truecolor")
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with a trailing "Z".

    Second precision only (microseconds are dropped), e.g. "2026-03-06T06:40:00Z".
    """
    # datetime.utcnow() is deprecated and returns a naive datetime; build an
    # aware UTC timestamp and normalize the "+00:00" offset to the usual "Z".
    now = dt.datetime.now(dt.timezone.utc).replace(microsecond=0)
    return now.isoformat().replace("+00:00", "Z")
def slugify(s: str) -> str:
    """Lower-case *s* and reduce it to a dash-separated slug.

    Alphanumerics are kept, common separators become dashes, everything else
    is dropped; runs of dashes collapse and the result never starts/ends with
    one. Falls back to "experiment" for inputs with no usable characters.
    """
    separators = {" ", "-", "_", "/", "."}
    pieces = [
        c if c.isalnum() else "-"
        for c in s.lower()
        if c.isalnum() or c in separators
    ]
    slug = "".join(pieces)
    while "--" in slug:
        slug = slug.replace("--", "-")
    return slug.strip("-") or "experiment"
def run(
    cmd: List[str],
    cwd: Optional[Path] = None,
    check: bool = True,
    capture: bool = True,
) -> subprocess.CompletedProcess:
    """Run *cmd* without a shell and return the completed process.

    Output is captured as text when *capture* is true; *check* raises
    CalledProcessError on a non-zero exit, mirroring subprocess.run.
    """
    working_dir = None if cwd is None else str(cwd)
    return subprocess.run(
        cmd,
        cwd=working_dir,
        check=check,
        text=True,
        capture_output=capture,
    )
def run_shell(command: str, cwd: Optional[Path] = None, check: bool = True) -> subprocess.CompletedProcess:
    """Execute *command* through /bin/bash, capturing stdout/stderr as text."""
    options: Dict[str, Any] = dict(
        shell=True,
        check=check,
        text=True,
        capture_output=True,
        executable="/bin/bash",
    )
    if cwd is not None:
        options["cwd"] = str(cwd)
    return subprocess.run(command, **options)
def git(repo: Path, *args: str, check: bool = True) -> subprocess.CompletedProcess:
    """Invoke a git subcommand inside *repo* via the shared run() helper."""
    command = ["git"]
    command.extend(args)
    return run(command, cwd=repo, check=check)
def current_branch(repo: Path) -> str:
    """Return the name of the branch currently checked out in *repo*."""
    proc = git(repo, "rev-parse", "--abbrev-ref", "HEAD")
    return proc.stdout.strip()
def ensure_clean_worktree(repo: Path) -> None:
    """Raise RuntimeError if *repo* has any uncommitted or untracked changes."""
    porcelain = git(repo, "status", "--porcelain").stdout
    if porcelain.strip():
        raise RuntimeError("Working tree is not clean. Commit or stash changes before running.")
def file_exists(repo: Path, relpath: str) -> bool:
    """Return True when *relpath* exists under the repository root *repo*."""
    candidate = repo / relpath
    return candidate.exists()
def safe_json_loads(text: str) -> Optional[Any]:
    """Parse *text* as JSON, returning None instead of raising on bad input.

    Catches only the errors json.loads actually raises — ValueError (which
    covers json.JSONDecodeError) and TypeError for non-string input — so
    unrelated failures are no longer silently swallowed by a broad except.
    """
    try:
        return json.loads(text)
    except (ValueError, TypeError):
        return None
def extract_json_objects_from_jsonl(text: str) -> List[dict]:
    """Collect every JSON object that appears on its own line of *text*.

    Blank lines, unparsable lines, and non-object JSON values are skipped.
    """
    found: List[dict] = []
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        parsed = safe_json_loads(candidate)
        if isinstance(parsed, dict):
            found.append(parsed)
    return found
@dataclass
class ExperimentRecord:
    """One row of the experiment log: a single attempted change and its outcome."""

    date: str  # UTC timestamp (utc_now()) when the record was written
    iteration: int  # 0 for the bootstrap run, 1..max_iters afterwards
    branch: str  # git branch the change was developed on
    name_of_change: str  # short human-readable label for the change
    exact_change: str  # detailed description (truncated codex summary)
    hypothesis: str  # why the change was expected to help
    tests_passed: bool  # whether test_cmd exited 0
    experiment_ran: bool  # whether the runner produced a valid metric payload
    metric_name: Optional[str]  # e.g. "val_bpb"; None until bootstrap discovers it
    metric_before: Optional[float]  # best metric before this change
    metric_after: Optional[float]  # metric produced by this change
    memory_gb: Optional[float]  # optional resource figure reported by the runner
    time_min: Optional[float]  # optional runtime reported by the runner
    improved: bool  # True when metric_after < metric_before (lower is better)
    passed_or_discarded: str  # "running" | "merged" | "discarded" | "failed"
    codex_stdout: str  # raw codex JSONL output (plus any repair attempt)
    codex_stderr: str  # raw codex stderr (plus any repair attempt)
    test_logs: str  # captured test command output
    experiment_logs: str  # captured experiment runner output
    merge_commit: Optional[str]  # SHA of the merge commit when merged, else None
class JsonStateLog:
    """JSON state file tracking the whole experimenter session.

    The entire state dict is rewritten on every mutation so that the file on
    disk is always a complete, valid snapshot that can be tailed by other
    tools while the run is in progress.
    """

    def __init__(self, path: Path, experiment_name: str, root_branch: str):
        self.path = path
        self.data: Dict[str, Any] = {
            "experiment_name": experiment_name,
            # Log file names look like "<slug>_<run_id>_logs"; recover the run
            # id from the stem, falling back to a fresh UUID otherwise.
            "uuid": path.stem.split("_")[-2] if "_" in path.stem else str(uuid.uuid4()),
            "created_at": utc_now(),
            "root_branch": root_branch,
            "status": {
                "phase": "initialized",
                "current_iteration": 0,
                "current_branch": root_branch,
                "best_metric": None,
                "best_metric_name": None,
                "bootstrap_done": False,
                "last_error": None,
                "finished": False,
                "current_step_message": "starting",
            },
            "experiments": [],
        }
        self.flush()

    def set_status(self, **kwargs: Any) -> None:
        """Merge *kwargs* into the status dict and persist."""
        self.data["status"].update(kwargs)
        self.flush()

    def add_experiment(self, rec: ExperimentRecord) -> None:
        """Append a new experiment record and persist."""
        self.data["experiments"].append(asdict(rec))
        self.flush()

    def replace_last_experiment(self, rec: ExperimentRecord) -> None:
        """Overwrite the most recent record (or append if none exist) and persist."""
        if self.data["experiments"]:
            self.data["experiments"][-1] = asdict(rec)
        else:
            self.data["experiments"].append(asdict(rec))
        self.flush()

    def flush(self) -> None:
        """Atomically rewrite the log file.

        Writing to a sibling temp file and renaming it over the target
        prevents a crash mid-write from leaving a truncated, unreadable
        JSON log (the previous direct write_text was not crash-safe).
        """
        tmp = self.path.with_name(self.path.name + ".tmp")
        tmp.write_text(json.dumps(self.data, indent=2), encoding="utf-8")
        tmp.replace(self.path)
class ExperimentUI:
    """Builds the rich renderable tree (history, summary, status) for Live.

    All methods are read-only: they take the JsonStateLog data dict and
    return rich objects; nothing here mutates experiment state.
    """

    def __init__(self, title: str):
        self.title = title  # shown as the history panel's title

    def _status_style(self, status: str) -> str:
        """Map a disposition string to the rich style used to color it."""
        s = (status or "").lower()
        if s == "merged":
            return "bold green"
        if s == "discarded":
            return "bold yellow"
        if s == "failed":
            return "bold red"
        if s == "running":
            return "bold cyan"
        return "white"

    def _status_label(self, status: str) -> str:
        """Map a disposition string to its display label (merged shows as PASSED)."""
        s = (status or "").lower()
        if s == "merged":
            return "PASSED"
        if s == "discarded":
            return "DISCARDED"
        if s == "failed":
            return "FAILED"
        if s == "running":
            return "RUNNING"
        # BUGFIX: upper-case the normalized `s`, not the raw argument, so a
        # None status cannot crash on .upper(); empty input yields UNKNOWN.
        return s.upper() or "UNKNOWN"

    def _short_commitish(self, exp: dict) -> str:
        """Best-effort 7-char commit identifier for an experiment row.

        Prefers the recorded merge commit, then scans codex stdout for a
        hex-looking token on any line mentioning "commit", then falls back
        to the branch name's trailing uuid fragment (or a dash placeholder).
        """
        merge_commit = exp.get("merge_commit")
        if isinstance(merge_commit, str) and merge_commit.strip():
            return merge_commit[:7]
        codex_stdout = exp.get("codex_stdout", "")
        for line in codex_stdout.splitlines():
            if "commit" in line.lower():
                parts = line.strip().split()
                for part in parts:
                    if len(part) >= 7 and all(c in "0123456789abcdef" for c in part[:7].lower()):
                        return part[:7]
        branch = exp.get("branch", "")
        tail = branch.split("-")[-1] if branch else ""
        return (tail or "-------")[:7]

    def _format_metric_triplet(self, exp: dict) -> Text:
        """Render one experiment as `commit,mem,time,metric,desc — STATUS`."""
        commit = self._short_commitish(exp)
        memory_gb = exp.get("memory_gb")
        time_min = exp.get("time_min")
        metric_after = exp.get("metric_after")
        mem_s = f"{memory_gb:.1f}" if isinstance(memory_gb, (int, float)) else "-"
        time_s = f"{time_min:.2f}" if isinstance(time_min, (int, float)) else "-"
        metric_s = f"{metric_after:.6f}" if isinstance(metric_after, (int, float)) else "-"
        desc = exp.get("name_of_change", "") or exp.get("hypothesis", "") or "-"
        desc = desc.replace("\n", " ").strip()
        if len(desc) > 72:
            desc = desc[:69] + "..."
        status = self._status_label(exp.get("passed_or_discarded", ""))
        status_style = self._status_style(exp.get("passed_or_discarded", ""))
        line = Text()
        line.append(f"{commit},", style="white")
        line.append(f"{mem_s},", style="bright_white")
        line.append(f"{time_s},", style="bright_white")
        line.append(f"{metric_s},", style="bold white")
        line.append(desc, style="white")
        line.append(" — ", style="dim")
        line.append(status, style=status_style)
        return line

    def _format_row(self, idx: int, exp: dict) -> Text:
        """Prefix a metric-triplet line with its 1-based row index."""
        t = Text()
        t.append(f"{idx:>3} ", style="dim")
        t.append_text(self._format_metric_triplet(exp))
        return t

    def _build_history_columns(self, data: dict) -> Columns:
        """Lay out all experiment rows across two equal columns."""
        experiments = data.get("experiments", [])
        left = Text()
        right = Text()
        header = Text(" commit,memory_gb,time_min,val_bpb,description\n", style="dim")
        left.append_text(header)
        right.append_text(header)
        if not experiments:
            left.append(" no experiments yet\n", style="dim")
            return Columns([left, right], expand=True, equal=True)
        rows = [self._format_row(i + 1, exp) for i, exp in enumerate(experiments)]
        # First half (rounded up) goes left, remainder right.
        split = (len(rows) + 1) // 2
        left_rows = rows[:split]
        right_rows = rows[split:]
        for r in left_rows:
            left.append_text(r)
            left.append("\n")
        for r in right_rows:
            right.append_text(r)
            right.append("\n")
        return Columns([left, right], expand=True, equal=True)

    def _summary_line(self, data: dict) -> Text:
        """Render disposition counts plus the best metric seen so far."""
        exps = data.get("experiments", [])
        merged = sum(1 for e in exps if e.get("passed_or_discarded") == "merged")
        discarded = sum(1 for e in exps if e.get("passed_or_discarded") == "discarded")
        failed = sum(1 for e in exps if e.get("passed_or_discarded") == "failed")
        running = sum(1 for e in exps if e.get("passed_or_discarded") == "running")
        status = data.get("status", {})
        best_metric = status.get("best_metric")
        best_metric_name = status.get("best_metric_name") or "metric"
        txt = Text()
        txt.append("passed ", style="green")
        txt.append(str(merged), style="bold green")
        txt.append(" discarded ", style="yellow")
        txt.append(str(discarded), style="bold yellow")
        txt.append(" failed ", style="red")
        txt.append(str(failed), style="bold red")
        txt.append(" running ", style="cyan")
        txt.append(str(running), style="bold cyan")
        txt.append(" best ", style="white")
        if isinstance(best_metric, (int, float)):
            txt.append(f"{best_metric_name}={best_metric:.6f}", style="bold white")
        else:
            txt.append("-", style="dim")
        return txt

    def _status_block(self, data: dict) -> Panel:
        """Render the current phase / branch / iteration panel."""
        status = data.get("status", {})
        root_branch = data.get("root_branch", "?")
        current_branch = status.get("current_branch", "?")
        current_iteration = status.get("current_iteration", 0)
        phase = status.get("phase", "?")
        msg = status.get("current_step_message", "-")
        last_error = status.get("last_error")
        body = Text()
        body.append("status\n", style="bold white")
        body.append(f"root_branch: {root_branch}\n", style="white")
        body.append(f"current_branch: {current_branch}\n", style="white")
        body.append(f"iteration: {current_iteration}\n", style="white")
        body.append(f"phase: {phase}\n", style="white")
        body.append(f"now: {msg}\n", style="bold cyan")
        if last_error:
            body.append(f"last_error: {last_error}\n", style="bold red")
        return Panel(body, border_style="white", title="Current status")

    def render(self, data: dict):
        """Compose history, summary and status panels into one Group for Live."""
        history = self._build_history_columns(data)
        summary = self._summary_line(data)
        status = self._status_block(data)
        return Group(
            Panel(history, title=self.title, border_style="white"),
            Panel(summary, border_style="white"),
            status,
        )
class CodexAgent:
    """Thin wrapper around the `codex` CLI for headless prompt execution."""

    def __init__(self, repo: Path, model: Optional[str] = None):
        self.repo = repo  # repository the codex process runs inside
        self.model = model  # optional model override passed as --model

    def exec(self, prompt: str, full_auto: bool = True, extra_args: Optional[List[str]] = None) -> Tuple[str, str, int]:
        """Run `codex exec --json <prompt>` inside the repo.

        Returns (stdout, stderr, returncode); deliberately does not raise on
        a non-zero exit so callers can log failures themselves.
        """
        cmd = ["codex", "exec", "--json"]
        if full_auto:
            cmd.append("--full-auto")
        if self.model:
            cmd.extend(["--model", self.model])
        if extra_args:
            cmd.extend(extra_args)
        # The prompt is the final positional argument.
        cmd.append(prompt)
        proc = subprocess.run(
            cmd,
            cwd=str(self.repo),
            text=True,
            capture_output=True,
        )
        return proc.stdout, proc.stderr, proc.returncode

    @staticmethod
    def summarize_jsonl_output(stdout: str) -> str:
        """Extract human-readable text from codex's JSONL stdout.

        Pulls common text-bearing keys from each JSON object on its own line;
        falls back to the raw stdout when nothing usable was found.
        """
        objs = extract_json_objects_from_jsonl(stdout)
        parts: List[str] = []
        for obj in objs:
            for key in ("message", "content", "text", "summary"):
                val = obj.get(key)
                if isinstance(val, str) and val.strip():
                    parts.append(val.strip())
        return "\n".join(parts).strip() or stdout.strip()
class Experimenter:
    """Autonomous optimize-loop orchestrator.

    One bootstrap pass has codex create a headless single-run experiment
    harness; each later iteration asks codex for one change on a fresh git
    branch, validates it (test command plus one experiment run), and merges
    the branch back into the root branch only when the metric strictly
    improves (lower is better). All progress is mirrored into a JsonStateLog
    file and a rich Live dashboard.
    """

    def __init__(
        self,
        repo: Path,
        objective: str,
        artifact_path: str,
        test_cmd: str,
        experiment_runner_path: str,
        max_iters: int,
        experiment_name: str,
        model: Optional[str] = None,
    ):
        self.repo = repo
        self.objective = objective
        self.artifact_path = artifact_path
        self.test_cmd = test_cmd  # shell command that must pass before a merge
        self.experiment_runner_path = experiment_runner_path
        self.max_iters = max_iters
        self.root_branch = current_branch(repo)  # branch experiment work merges into
        self.agent = CodexAgent(repo, model=model)
        self.ui = ExperimentUI(f"Experimenter · {experiment_name}")
        run_id = str(uuid.uuid4())[:8]
        # Log file lives inside the target repo; JsonStateLog recovers run_id
        # from this "<slug>_<run_id>_logs" stem.
        log_name = f"{slugify(experiment_name)}_{run_id}_logs.json"
        self.log = JsonStateLog(repo / log_name, experiment_name, self.root_branch)
        self.best_metric: Optional[float] = None  # lowest metric seen so far
        self.metric_name: Optional[str] = None  # learned from the bootstrap run
        self.live: Optional[Live] = None  # set only while the Live context is open

    def refresh_ui(self) -> None:
        """Re-render the dashboard if the Live display is active."""
        if self.live is not None:
            self.live.update(self.ui.render(self.log.data), refresh=True)

    def write_status(self, phase: str, current_step_message: Optional[str] = None, **extra: Any) -> None:
        """Persist the current phase/branch/metric snapshot and refresh the UI."""
        payload = dict(
            phase=phase,
            current_branch=current_branch(self.repo),
            best_metric=self.best_metric,
            best_metric_name=self.metric_name,
            **extra,
        )
        if current_step_message is not None:
            payload["current_step_message"] = current_step_message
        self.log.set_status(**payload)
        self.refresh_ui()

    def checkout_root(self) -> None:
        """Switch the working tree back to the root branch."""
        git(self.repo, "checkout", self.root_branch)

    def create_branch(self, name: str) -> None:
        """Create and check out *name*; -B resets it if it already exists."""
        git(self.repo, "checkout", "-B", name)

    def delete_branch(self, name: str) -> None:
        """Return to root and force-delete *name* (best effort — no raise)."""
        self.checkout_root()
        git(self.repo, "branch", "-D", name, check=False)

    def merge_branch(self, name: str) -> Optional[str]:
        """Merge *name* into root (--no-ff keeps a merge commit), then delete it.

        Returns the SHA of the resulting merge commit.
        """
        self.checkout_root()
        git(self.repo, "merge", "--no-ff", name, "-m", f"Merge {name} from experimenter")
        merge_sha = git(self.repo, "rev-parse", "HEAD").stdout.strip()
        git(self.repo, "branch", "-D", name, check=False)
        return merge_sha

    def run_tests(self) -> Tuple[bool, str]:
        """Run the configured test command; return (passed, combined logs)."""
        proc = run_shell(self.test_cmd, cwd=self.repo, check=False)
        ok = proc.returncode == 0
        logs = f"$ {self.test_cmd}\n\nSTDOUT:\n{proc.stdout}\n\nSTDERR:\n{proc.stderr}"
        return ok, logs

    def run_experiment(
        self,
    ) -> Tuple[bool, Optional[str], Optional[float], Optional[float], Optional[float], str]:
        """Execute the experiment runner once and parse its JSON stdout.

        Returns (ok, metric_name, metric_value, memory_gb, time_min, logs).
        ok is False when the process fails, stdout is not a JSON object, or
        the required metric fields are missing/non-numeric. memory_gb and
        time_min are optional and silently fall back to None when invalid.
        NOTE(review): the runner's ENTIRE stdout must be the JSON payload —
        any extra prints would break parsing here.
        """
        cmd = f"python {shlex.quote(self.experiment_runner_path)}"
        proc = run_shell(cmd, cwd=self.repo, check=False)
        logs = f"$ {cmd}\n\nSTDOUT:\n{proc.stdout}\n\nSTDERR:\n{proc.stderr}"
        if proc.returncode != 0:
            return False, None, None, None, None, logs
        parsed = safe_json_loads(proc.stdout.strip())
        if not isinstance(parsed, dict):
            return False, None, None, None, None, logs + "\n\nRunner output was not valid JSON."
        metric_name = parsed.get("metric_name")
        metric_value = parsed.get("metric_value")
        memory_gb = parsed.get("memory_gb")
        time_min = parsed.get("time_min")
        if not isinstance(metric_name, str):
            return False, None, None, None, None, logs + "\n\nMissing metric_name."
        try:
            metric_value = float(metric_value)
        except Exception:
            return False, None, None, None, None, logs + "\n\nmetric_value was not numeric."
        try:
            memory_gb = float(memory_gb) if memory_gb is not None else None
        except Exception:
            memory_gb = None
        try:
            time_min = float(time_min) if time_min is not None else None
        except Exception:
            time_min = None
        return True, metric_name, metric_value, memory_gb, time_min, logs

    def add_running_placeholder(self, iteration: int, branch: str, name: str) -> None:
        """Append a provisional "running" record; it is replaced in place later."""
        self.log.add_experiment(
            ExperimentRecord(
                date=utc_now(),
                iteration=iteration,
                branch=branch,
                name_of_change=name,
                exact_change="",
                hypothesis="",
                tests_passed=False,
                experiment_ran=False,
                metric_name=self.metric_name,
                metric_before=self.best_metric,
                metric_after=None,
                memory_gb=None,
                time_min=None,
                improved=False,
                passed_or_discarded="running",
                codex_stdout="",
                codex_stderr="",
                test_logs="",
                experiment_logs="",
                merge_commit=None,
            )
        )
        self.refresh_ui()

    def bootstrap_runner(self) -> None:
        """Iteration 0: have codex create the experiment runner, validate, merge.

        On success this seeds best_metric / metric_name for all later
        iterations; on any failure the branch is discarded and RuntimeError
        is raised, since the loop cannot proceed without a harness.
        """
        self.write_status("bootstrap_runner", "bootstrapping experiment runner")
        branch = f"exp/bootstrap-runner-{str(uuid.uuid4())[:8]}"
        self.create_branch(branch)
        self.add_running_placeholder(0, branch, "bootstrap experiment runner")
        prompt = f"""
You are building the bootstrap experiment runner for an autonomous experiment loop.
Repository root: {self.repo}
Target artifact to optimize: {self.artifact_path}
Objective: {self.objective}
Create exactly one headless runner script at:
{self.experiment_runner_path}
Requirements:
1. The script must run one experiment end-to-end for the current repository state.
2. It must print ONLY valid JSON to stdout in the form:
{{"metric_name": "<name>", "metric_value": <float>, "memory_gb": <float|null>, "time_min": <float|null>, "extra": {{...}}}}
3. The metric must be the optimized metric implied by:
"{self.objective}"
4. Add any minimal helper code/config needed.
5. If there are tests, update or add a small smoke test if appropriate.
6. Commit your work locally on this branch with a clear git commit message.
7. Do not ask questions. Make reasonable assumptions and implement the smallest robust version.
"""
        stdout, stderr, rc = self.agent.exec(prompt)
        if rc != 0:
            # Codex itself failed: discard the branch and record the failure.
            self.delete_branch(branch)
            self.log.replace_last_experiment(
                ExperimentRecord(
                    date=utc_now(),
                    iteration=0,
                    branch=branch,
                    name_of_change="bootstrap experiment runner",
                    exact_change="codex bootstrap failed",
                    hypothesis="Need a reproducible single-run experiment harness.",
                    tests_passed=False,
                    experiment_ran=False,
                    metric_name=None,
                    metric_before=None,
                    metric_after=None,
                    memory_gb=None,
                    time_min=None,
                    improved=False,
                    passed_or_discarded="failed",
                    codex_stdout=stdout,
                    codex_stderr=stderr,
                    test_logs="",
                    experiment_logs="",
                    merge_commit=None,
                )
            )
            self.refresh_ui()
            raise RuntimeError("Codex bootstrap failed.")
        if not file_exists(self.repo, self.experiment_runner_path):
            self.delete_branch(branch)
            raise RuntimeError(f"Bootstrap did not create {self.experiment_runner_path}")
        self.write_status("bootstrap_runner", "validating bootstrap tests")
        tests_ok, test_logs = self.run_tests()
        self.write_status("bootstrap_runner", "running bootstrap experiment")
        exp_ok, metric_name, metric_value, memory_gb, time_min, exp_logs = self.run_experiment()
        if not tests_ok or not exp_ok:
            # Harness was created but does not validate — record and abort.
            self.delete_branch(branch)
            self.log.replace_last_experiment(
                ExperimentRecord(
                    date=utc_now(),
                    iteration=0,
                    branch=branch,
                    name_of_change="bootstrap experiment runner",
                    exact_change=f"Created {self.experiment_runner_path} but validation failed.",
                    hypothesis="Need a reproducible single-run experiment harness.",
                    tests_passed=tests_ok,
                    experiment_ran=exp_ok,
                    metric_name=metric_name,
                    metric_before=None,
                    metric_after=metric_value,
                    memory_gb=memory_gb,
                    time_min=time_min,
                    improved=False,
                    passed_or_discarded="failed",
                    codex_stdout=stdout,
                    codex_stderr=stderr,
                    test_logs=test_logs,
                    experiment_logs=exp_logs,
                    merge_commit=None,
                )
            )
            self.refresh_ui()
            raise RuntimeError("Bootstrap runner branch failed validation.")
        self.write_status("bootstrap_runner", "merging bootstrap branch")
        merge_sha = self.merge_branch(branch)
        # The bootstrap result becomes the baseline all iterations compare to.
        self.best_metric = metric_value
        self.metric_name = metric_name
        self.log.replace_last_experiment(
            ExperimentRecord(
                date=utc_now(),
                iteration=0,
                branch=branch,
                name_of_change="bootstrap experiment runner",
                exact_change=f"Created {self.experiment_runner_path} and minimal support code.",
                hypothesis="Need a reproducible single-run experiment harness before iterative optimization.",
                tests_passed=tests_ok,
                experiment_ran=exp_ok,
                metric_name=metric_name,
                metric_before=None,
                metric_after=metric_value,
                memory_gb=memory_gb,
                time_min=time_min,
                improved=True,
                passed_or_discarded="merged",
                codex_stdout=stdout,
                codex_stderr=stderr,
                test_logs=test_logs,
                experiment_logs=exp_logs,
                merge_commit=merge_sha,
            )
        )
        self.refresh_ui()
        self.write_status("bootstrap_complete", "bootstrap finished", bootstrap_done=True)

    def run_iteration(self, i: int) -> None:
        """Run one hypothesis → implement → validate → merge-or-discard cycle."""
        self.write_status("iteration_start", f"starting iteration {i}", current_iteration=i)
        branch = f"exp/{slugify(self.metric_name or 'metric')}-iter-{i}-{str(uuid.uuid4())[:8]}"
        self.create_branch(branch)
        before_metric = self.best_metric
        self.add_running_placeholder(i, branch, f"iteration {i} hypothesis search")
        prompt = f"""
You are one iteration of an autonomous ML/code experimenter.
Root branch: {self.root_branch}
Current branch: {branch}
Optimization objective: {self.objective}
Target artifact: {self.artifact_path}
Experiment runner: {self.experiment_runner_path}
Current best metric ({self.metric_name}): {self.best_metric}
Your job:
1. Form one concrete hypothesis for improving the metric.
2. Implement the smallest useful change.
3. Add or update tests or validation as needed.
4. Commit your changes on this branch with a clear commit message.
Hard requirements:
- Work headlessly.
- Do not ask questions.
- Favor small, reversible changes.
- Ensure the codebase remains runnable.
- Do not merge branches yourself.
- Assume lower metric is better.
- Return enough detail in your final response to describe:
- hypothesis
- exact files changed
- exact nature of the change
Important:
- The outer orchestrator will run tests and the experiment runner after you finish.
- If you notice likely runtime issues, proactively fix them before finishing.
"""
        self.write_status("iteration_coding", f"codex implementing iteration {i}", current_iteration=i)
        stdout, stderr, rc = self.agent.exec(prompt)
        codex_summary = self.agent.summarize_jsonl_output(stdout)
        if rc != 0:
            # Codex failed outright; nothing to validate.
            self.delete_branch(branch)
            self.log.replace_last_experiment(
                ExperimentRecord(
                    date=utc_now(),
                    iteration=i,
                    branch=branch,
                    name_of_change=f"iteration {i} failed before validation",
                    exact_change="",
                    hypothesis="",
                    tests_passed=False,
                    experiment_ran=False,
                    metric_name=self.metric_name,
                    metric_before=before_metric,
                    metric_after=None,
                    memory_gb=None,
                    time_min=None,
                    improved=False,
                    passed_or_discarded="failed",
                    codex_stdout=stdout,
                    codex_stderr=stderr,
                    test_logs="",
                    experiment_logs="",
                    merge_commit=None,
                )
            )
            self.refresh_ui()
            return
        self.write_status("iteration_validation", f"running tests for iteration {i}", current_iteration=i)
        tests_ok, test_logs = self.run_tests()
        self.write_status("iteration_validation", f"running experiment for iteration {i}", current_iteration=i)
        exp_ok, metric_name, metric_after, memory_gb, time_min, exp_logs = self.run_experiment()
        if not tests_ok or not exp_ok:
            # Exactly one repair round: tell codex it failed, then re-validate.
            repair_prompt = f"""
The previous implementation on this branch did not validate.
Please repair the branch until:
1. tests pass using:
{self.test_cmd}
2. the experiment runner succeeds:
python {self.experiment_runner_path}
Do not change the overall goal.
Commit any repair changes with a clear commit message.
"""
            self.write_status("iteration_repair", f"repairing iteration {i}", current_iteration=i)
            r_stdout, r_stderr, _ = self.agent.exec(repair_prompt)
            stdout += "\n\n--- REPAIR ATTEMPT ---\n\n" + r_stdout
            stderr += "\n\n--- REPAIR ATTEMPT ---\n\n" + r_stderr
            self.write_status("iteration_validation", f"re-running tests for iteration {i}", current_iteration=i)
            tests_ok, test_logs = self.run_tests()
            self.write_status("iteration_validation", f"re-running experiment for iteration {i}", current_iteration=i)
            exp_ok, metric_name, metric_after, memory_gb, time_min, exp_logs = self.run_experiment()
        # Lower is better; a merge requires validation AND a strict improvement.
        improved = bool(
            tests_ok
            and exp_ok
            and metric_after is not None
            and before_metric is not None
            and metric_after < before_metric
        )
        # NOTE(review): codex_summary is not recomputed after a repair attempt,
        # so the logged summary describes only the first codex response.
        exact_change = codex_summary[:8000]
        hypothesis = codex_summary.splitlines()[0] if codex_summary else f"iteration {i}"
        merge_sha = None
        disposition = "discarded"
        if improved:
            self.write_status("iteration_merge", f"merging successful iteration {i}", current_iteration=i)
            merge_sha = self.merge_branch(branch)
            self.best_metric = metric_after
            self.metric_name = metric_name
            disposition = "merged"
        else:
            self.write_status("iteration_cleanup", f"discarding iteration {i}", current_iteration=i)
            self.delete_branch(branch)
        self.log.replace_last_experiment(
            ExperimentRecord(
                date=utc_now(),
                iteration=i,
                branch=branch,
                name_of_change=hypothesis[:160],
                exact_change=exact_change,
                hypothesis=hypothesis,
                tests_passed=tests_ok,
                experiment_ran=exp_ok,
                metric_name=metric_name or self.metric_name,
                metric_before=before_metric,
                metric_after=metric_after,
                memory_gb=memory_gb,
                time_min=time_min,
                improved=improved,
                passed_or_discarded=disposition,
                codex_stdout=stdout,
                codex_stderr=stderr,
                test_logs=test_logs,
                experiment_logs=exp_logs,
                merge_commit=merge_sha,
            )
        )
        self.refresh_ui()
        self.write_status(
            "iteration_complete",
            f"iteration {i} complete: {disposition}",
            current_iteration=i,
            last_error=None if (tests_ok and exp_ok) else "validation_failed_or_no_improvement",
        )

    def run(self) -> None:
        """Top-level entry: verify repo state, then bootstrap and iterate under a Live UI.

        Raises RuntimeError early if the worktree is dirty; any fatal error is
        recorded into the log before being re-raised.
        """
        ensure_clean_worktree(self.repo)
        self.checkout_root()
        with Live(
            self.ui.render(self.log.data),
            console=console,
            refresh_per_second=6,
            screen=True,
            auto_refresh=True,
        ) as live:
            self.live = live
            self.write_status("starting", "starting experimenter", bootstrap_done=False)
            try:
                self.bootstrap_runner()
                for i in range(1, self.max_iters + 1):
                    self.run_iteration(i)
                self.write_status("finished", "all iterations finished", finished=True)
                # Brief pause so the final frame is visible before Live exits.
                time.sleep(0.5)
            except Exception as e:
                self.write_status("failed", f"fatal error: {e}", last_error=str(e), finished=True)
                time.sleep(0.5)
                raise
def parse_args() -> argparse.Namespace:
    """Define and parse the experimenter's command-line interface."""
    parser = argparse.ArgumentParser(description="Headless Codex experimenter meta-agent")
    required_flags = [
        ("--repo", "Path to git repository"),
        ("--objective", "Meta objective"),
        ("--artifact-path", "Path to target artifact"),
        ("--test-cmd", "Shell command to validate tests"),
    ]
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument(
        "--experiment-runner-path",
        default="experiment_runner.py",
        help="Path Codex should create for the single-run experiment harness",
    )
    parser.add_argument("--max-iters", type=int, default=5, help="Number of experiment iterations")
    parser.add_argument("--experiment-name", default="meta_experiment", help="Name for the JSON log")
    parser.add_argument("--model", default=None, help="Optional Codex model override")
    return parser.parse_args()
def main() -> int:
    """CLI entry point: build an Experimenter from the args and run it to completion.

    Returns 0 on success; fatal errors propagate as exceptions from run().
    """
    args = parse_args()
    repo = Path(args.repo).resolve()
    exp = Experimenter(
        repo=repo,
        objective=args.objective,
        artifact_path=args.artifact_path,
        test_cmd=args.test_cmd,
        experiment_runner_path=args.experiment_runner_path,
        max_iters=args.max_iters,
        experiment_name=args.experiment_name,
        model=args.model,
    )
    exp.run()
    console.print(f"\n[bold green]Finished[/] · log written to [cyan]{exp.log.path.name}[/cyan]")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
python experimenter.py \
  --repo /path/to/repo \
  --objective "iteratively improve the model so that the loss function defined in /loss/weirdmse.py decreases" \
  --artifact-path "/loss/weirdmse.py" \
  --test-cmd "pytest -q" \
  --experiment-runner-path "tools/run_one_experiment.py" \
  --max-iters 20 \
  --experiment-name "weirdmse_optimization"