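"""Analyze a GitHub Copilot chatreplay JSON export for context/token management patterns.

The script flattens the "request" log entries found under each prompt, derives
token/overhead/cache metrics, renders charts, and writes a Markdown + HTML report.

Outputs (under the chosen output directory):
  charts/   PNG charts
  tables/   requests.csv and prompts.csv for further slicing
  report.md / report.html

Third-party dependencies: pandas, numpy, matplotlib, seaborn, scikit-learn,
tabulate (needed by DataFrame.to_markdown), and optionally tiktoken for more
accurate token counts of the visible prompt text.

Example invocation (the script filename here is illustrative):
    python analyze_chatreplay.py --input session.chatreplay.json --out ./analysis
"""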
from __future__ import annotations

import argparse
import datetime as dt
import json
import math
import re
import statistics
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable

import matplotlib

# Select the non-interactive Agg backend (before importing pyplot) so charts can
# be rendered without a display.
matplotlib.use("Agg")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
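
# Token estimators only measure the *visible* prompt text exported in the chatreplay.
# The API-reported `prompt_tokens` in the usage metadata also covers system
# instructions, tool schemas, and other hidden context, which is exactly the gap
# the "overhead" metrics below try to surface.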
@dataclass(frozen=True)
class TokenEstimator:
    name: str

    def count(self, text: str) -> int:
        raise NotImplementedError


class TiktokenEstimator(TokenEstimator):
    def __init__(self, encoding_name: str = "cl100k_base"):
        super().__init__(name=f"tiktoken:{encoding_name}")
        import tiktoken  # type: ignore

        # The base dataclass is frozen, so regular attribute assignment would raise
        # FrozenInstanceError; bypass __setattr__ for the cached encoder object.
        object.__setattr__(self, "_enc", tiktoken.get_encoding(encoding_name))

    def count(self, text: str) -> int:
        if not text:
            return 0
        return len(self._enc.encode(text))


class HeuristicEstimator(TokenEstimator):
    def __init__(self):
        super().__init__(name="heuristic:chars/4")

    def count(self, text: str) -> int:
        if not text:
            return 0
        # Common rough heuristic: ~4 chars per token for English-ish text.
        return max(1, math.ceil(len(text) / 4))
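
# Timestamps in the export look like "2026-01-06T21:38:52.766Z"; the trailing "Z"
# is rewritten to "+00:00" because datetime.fromisoformat() only accepts it
# natively on Python 3.11+.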
def _parse_iso(ts: str | None) -> dt.datetime | None:
    if not ts:
        return None
    # Example: 2026-01-06T21:38:52.766Z
    try:
        if ts.endswith("Z"):
            ts = ts[:-1] + "+00:00"
        return dt.datetime.fromisoformat(ts)
    except Exception:
        return None


def _safe_filename(value: str) -> str:
    value = value.strip()
    value = re.sub(r"\s+", " ", value)
    value = re.sub(r"[^a-zA-Z0-9._ -]+", "_", value)
    value = value.strip(" .-")
    return value or "artifact"


def _json_size_bytes(obj: Any) -> int:
    try:
        return len(json.dumps(obj, ensure_ascii=False).encode("utf-8"))
    except Exception:
        return 0


def load_chatreplay(path: Path) -> dict[str, Any]:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)
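
# Illustrative (trimmed) shape of the export, inferred from the fields read below;
# values are placeholders, not real data:
#
# {
#   "exportedAt": "...", "totalPrompts": 3, "totalLogEntries": 42,
#   "prompts": [
#     {
#       "prompt": "user-visible prompt text",
#       "hasSeen": true, "logCount": 14,
#       "logs": [
#         {
#           "kind": "request", "id": "...", "type": "...", "name": "...",
#           "metadata": {
#             "model": "...", "maxPromptTokens": 128000, "duration": 1234,
#             "tools": ["..."],
#             "usage": {
#               "prompt_tokens": 10000, "completion_tokens": 500, "total_tokens": 10500,
#               "prompt_tokens_details": {"cached_tokens": 8000}
#             }
#           }
#         }
#       ]
#     }
#   ]
# }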
def iter_request_logs(chatreplay: dict[str, Any]) -> Iterable[dict[str, Any]]:
    prompts = chatreplay.get("prompts") or []
    for prompt_idx, prompt in enumerate(prompts):
        logs = prompt.get("logs") or []
        for log_idx, log in enumerate(logs):
            if log.get("kind") != "request":
                continue
            meta = log.get("metadata") or {}
            usage = (meta.get("usage") or {}) if isinstance(meta, dict) else {}
            yield {
                "prompt_index": prompt_idx,
                "log_index": log_idx,
                "prompt_text": prompt.get("prompt") or "",
                "hasSeen": bool(prompt.get("hasSeen")),
                "log_id": log.get("id"),
                "log_type": log.get("type"),
                "log_name": log.get("name"),
                "requestType": meta.get("requestType"),
                "model": meta.get("model"),
                "maxPromptTokens": meta.get("maxPromptTokens"),
                "maxResponseTokens": meta.get("maxResponseTokens"),
                "location": meta.get("location"),
                "startTime": meta.get("startTime"),
                "endTime": meta.get("endTime"),
                "duration_ms": meta.get("duration"),
                "timeToFirstToken_ms": meta.get("timeToFirstToken"),
                "prompt_tokens": usage.get("prompt_tokens"),
                "completion_tokens": usage.get("completion_tokens"),
                "total_tokens": usage.get("total_tokens"),
                "cached_tokens": (usage.get("prompt_tokens_details") or {}).get("cached_tokens"),
                "tools_count": len(meta.get("tools") or []) if isinstance(meta, dict) else 0,
                "tools_json_bytes": _json_size_bytes(meta.get("tools") or []),
                "meta_json_bytes": _json_size_bytes(meta),
            }
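
# Cheap structural features of the visible prompt text: size, list/code-fence density,
# Windows path / file URI mentions, and imperative keywords. `prompt_token_est` is
# the estimator's count of the same text and feeds the overhead calculation below.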
def compute_prompt_text_features(df: pd.DataFrame, token_estimator: TokenEstimator) -> pd.DataFrame:
    text = df["prompt_text"].fillna("").astype(str)
    df = df.copy()
    df["prompt_chars"] = text.str.len()
    df["prompt_lines"] = text.str.count(r"\n") + 1
    df["prompt_words"] = text.str.count(r"\S+")
    df["prompt_code_fences"] = text.str.count(r"```")
    df["prompt_backticks"] = text.str.count(r"`")
    df["prompt_bullets"] = text.str.count(r"(?m)^(\s*[-*]|\s*\d+\.)\s+")
    df["prompt_filepaths"] = text.str.count(r"(?i)[A-Z]:\\[^\n\r\t]+") + text.str.count(
        r"file:///[^\s\)\]]+"
    )
    df["prompt_imperatives"] = text.str.count(
        r"(?im)\b(must|always|never|ensure|required|required:|do not)\b"
    )
    df["prompt_token_est"] = [token_estimator.count(t) for t in text.tolist()]
    return df
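
# Derived per-request metrics:
#   cached_ratio        = cached_tokens / prompt_tokens         (prefix reuse)
#   completion_ratio    = completion_tokens / total_tokens
#   context_pressure    = prompt_tokens / maxPromptTokens       (how close to the window)
#   prompt_overhead_est = prompt_tokens - prompt_token_est      (hidden/system/tool context)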
def compute_derived_metrics(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    def _to_num(series: pd.Series) -> pd.Series:
        return pd.to_numeric(series, errors="coerce")

    for col in [
        "maxPromptTokens",
        "duration_ms",
        "timeToFirstToken_ms",
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "cached_tokens",
        "tools_count",
        "tools_json_bytes",
        "meta_json_bytes",
        "prompt_token_est",
        "prompt_chars",
        "prompt_lines",
        "prompt_bullets",
        "prompt_filepaths",
        "prompt_imperatives",
    ]:
        if col in df.columns:
            df[col] = _to_num(df[col])
    df["start_dt"] = df["startTime"].apply(_parse_iso)
    df["end_dt"] = df["endTime"].apply(_parse_iso)
    df["cached_ratio"] = df["cached_tokens"] / df["prompt_tokens"]
    df["completion_ratio"] = df["completion_tokens"] / df["total_tokens"]
    df["context_pressure"] = df["prompt_tokens"] / df["maxPromptTokens"]
    # Approximate overhead: prompt_tokens minus estimated tokens in the explicit user prompt string.
    # This tends to capture system prompt + tool schemas + hidden context included by Copilot.
    df["prompt_overhead_est"] = df["prompt_tokens"] - df["prompt_token_est"]
    # Tool schema overhead proxy: the JSON size of tool definitions.
    df["tools_bytes_ratio"] = df["tools_json_bytes"] / df["meta_json_bytes"].replace(0, np.nan)
    return df
def prompt_level_rollups(chatreplay: dict[str, Any]) -> pd.DataFrame:
    prompts = chatreplay.get("prompts") or []
    rows: list[dict[str, Any]] = []
    for i, p in enumerate(prompts):
        text = p.get("prompt") or ""
        logs = p.get("logs") or []
        rows.append(
            {
                "prompt_index": i,
                "prompt_chars": len(text),
                "prompt_lines": text.count("\n") + 1,
                "logCount": p.get("logCount"),
                "logs_len": len(logs),
                "hasSeen": bool(p.get("hasSeen")),
            }
        )
    return pd.DataFrame(rows)
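
# Pairwise TF-IDF cosine similarity over the exported prompt texts; values near 1.0
# flag repeated or near-duplicate prompts (a common source of instruction bloat).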
def similarity_matrix(prompt_texts: list[str]) -> tuple[np.ndarray, list[str]]:
    cleaned = [t if t.strip() else "(empty)" for t in prompt_texts]
    # Keep stopwords; prompts are often instruction-heavy, so stopword removal can hide repetition.
    vec = TfidfVectorizer(min_df=1, max_features=5000)
    X = vec.fit_transform(cleaned)
    sim = cosine_similarity(X)
    labels = [f"p{i}" for i in range(len(cleaned))]
    return sim, labels
def save_plot(fig: plt.Figure, out_path: Path) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=180, bbox_inches="tight")
    plt.close(fig)
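
# Renders every chart into <out_dir>/charts/: token histogram, prompt-vs-completion
# scatter, time series for prompt tokens / cached ratio / estimated overhead,
# chars-vs-tokens scatter, top-10 bar chart, and the prompt similarity heatmap.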
def plot_all(df: pd.DataFrame, prompts: list[str], out_dir: Path) -> dict[str, Path]:
    charts_dir = out_dir / "charts"
    charts: dict[str, Path] = {}
    sns.set_theme(style="whitegrid")

    if not df.empty:
        fig, ax = plt.subplots(figsize=(10, 4))
        sns.histplot(df["prompt_tokens"].dropna(), bins=30, ax=ax)
        ax.set_title("Prompt tokens distribution")
        ax.set_xlabel("prompt_tokens")
        charts["prompt_tokens_hist"] = charts_dir / "prompt_tokens_hist.png"
        save_plot(fig, charts["prompt_tokens_hist"])

        fig, ax = plt.subplots(figsize=(7, 5))
        sns.scatterplot(
            data=df,
            x="prompt_tokens",
            y="completion_tokens",
            hue="model",
            ax=ax,
            legend=False,
        )
        ax.set_title("Prompt vs completion tokens")
        charts["prompt_vs_completion"] = charts_dir / "prompt_vs_completion.png"
        save_plot(fig, charts["prompt_vs_completion"])

        df_time = df.dropna(subset=["start_dt"]).sort_values("start_dt")
        if not df_time.empty:
            fig, ax = plt.subplots(figsize=(10, 4))
            ax.plot(df_time["start_dt"], df_time["prompt_tokens"], marker="o", linewidth=1)
            ax.set_title("Prompt tokens over time")
            ax.set_xlabel("time")
            ax.set_ylabel("prompt_tokens")
            charts["prompt_tokens_over_time"] = charts_dir / "prompt_tokens_over_time.png"
            save_plot(fig, charts["prompt_tokens_over_time"])

            fig, ax = plt.subplots(figsize=(10, 4))
            ax.plot(
                df_time["start_dt"],
                df_time["cached_ratio"].fillna(0),
                marker="o",
                linewidth=1,
            )
            ax.set_title("Cached token ratio over time")
            ax.set_xlabel("time")
            ax.set_ylabel("cached_tokens / prompt_tokens")
            ax.set_ylim(0, 1)
            charts["cached_ratio_over_time"] = charts_dir / "cached_ratio_over_time.png"
            save_plot(fig, charts["cached_ratio_over_time"])

            fig, ax = plt.subplots(figsize=(10, 4))
            ax.plot(
                df_time["start_dt"],
                df_time["prompt_overhead_est"],
                marker="o",
                linewidth=1,
            )
            ax.set_title("Estimated overhead tokens (prompt_tokens - user_prompt_est)")
            ax.set_xlabel("time")
            ax.set_ylabel("overhead_tokens_est")
            charts["overhead_over_time"] = charts_dir / "overhead_over_time.png"
            save_plot(fig, charts["overhead_over_time"])

        fig, ax = plt.subplots(figsize=(7, 5))
        sns.scatterplot(data=df, x="prompt_chars", y="prompt_tokens", hue="model", ax=ax, legend=False)
        ax.set_title("User prompt chars vs prompt_tokens")
        charts["chars_vs_prompt_tokens"] = charts_dir / "chars_vs_prompt_tokens.png"
        save_plot(fig, charts["chars_vs_prompt_tokens"])

        fig, ax = plt.subplots(figsize=(10, 4))
        top = (
            df.sort_values("prompt_tokens", ascending=False)
            .head(10)
            .assign(label=lambda d: d["prompt_index"].apply(lambda x: f"p{x}"))
        )
        sns.barplot(data=top, x="label", y="prompt_tokens", ax=ax)
        ax.set_title("Top 10 requests by prompt_tokens")
        charts["top_prompt_tokens"] = charts_dir / "top_prompt_tokens.png"
        save_plot(fig, charts["top_prompt_tokens"])

    if prompts:
        sim, labels = similarity_matrix(prompts)
        fig, ax = plt.subplots(figsize=(7, 6))
        sns.heatmap(sim, vmin=0, vmax=1, cmap="viridis", xticklabels=labels, yticklabels=labels, ax=ax)
        ax.set_title("Prompt text similarity (TF-IDF cosine)")
        charts["prompt_similarity"] = charts_dir / "prompt_similarity.png"
        save_plot(fig, charts["prompt_similarity"])

    return charts
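
# Compact numeric summary (count/sum/mean/p50/p90/p99/max) for the key token and
# latency columns, plus cached_ratio and context_pressure aggregates; the result is
# embedded as JSON in the report.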
def summarize_numbers(df: pd.DataFrame) -> dict[str, Any]:
    if df.empty:
        return {}

    def s(col: str) -> pd.Series:
        return pd.to_numeric(df[col], errors="coerce").dropna()

    summary: dict[str, Any] = {}
    for col in [
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "cached_tokens",
        "prompt_overhead_est",
        "duration_ms",
    ]:
        values = s(col)
        if values.empty:
            continue
        summary[col] = {
            "count": int(values.shape[0]),
            "sum": float(values.sum()),
            "mean": float(values.mean()),
            "p50": float(values.quantile(0.5)),
            "p90": float(values.quantile(0.9)),
            "p99": float(values.quantile(0.99)) if values.shape[0] >= 10 else float(values.max()),
            "max": float(values.max()),
        }
    ratios = s("cached_ratio")
    if not ratios.empty:
        summary["cached_ratio"] = {
            "mean": float(ratios.mean()),
            "p50": float(ratios.quantile(0.5)),
            "max": float(ratios.max()),
        }
    pressure = s("context_pressure")
    if not pressure.empty:
        summary["context_pressure"] = {
            "mean": float(pressure.mean()),
            "p90": float(pressure.quantile(0.9)),
            "max": float(pressure.max()),
        }
    return summary
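
# Assembles report.md (summary, estimated overhead, cache behavior, usage charts,
# repetition signals, context pressure, recommendations) plus a minimal report.html
# that links to it.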
def write_report(
    *,
    out_dir: Path,
    source_path: Path,
    token_estimator: TokenEstimator,
    chatreplay: dict[str, Any],
    df: pd.DataFrame,
    prompt_df: pd.DataFrame,
    charts: dict[str, Path],
) -> tuple[Path, Path]:
    out_dir.mkdir(parents=True, exist_ok=True)
    exported_at = chatreplay.get("exportedAt")
    total_prompts = chatreplay.get("totalPrompts")
    total_logs = chatreplay.get("totalLogEntries")
    models = sorted(set(df["model"].dropna().astype(str).tolist())) if not df.empty else []
    summary = summarize_numbers(df)

    # Identify the worst offenders for context size.
    top_overhead = (
        df.sort_values("prompt_overhead_est", ascending=False).head(8)[
            [
                "prompt_index",
                "log_index",
                "model",
                "prompt_tokens",
                "prompt_token_est",
                "prompt_overhead_est",
                "cached_tokens",
                "cached_ratio",
            ]
        ]
        if not df.empty
        else pd.DataFrame()
    )
    top_pressure = (
        df.sort_values("context_pressure", ascending=False).head(8)[
            ["prompt_index", "log_index", "model", "prompt_tokens", "maxPromptTokens", "context_pressure"]
        ]
        if not df.empty
        else pd.DataFrame()
    )

    # Pairwise similarity highlights repeated prompts.
    prompt_texts = [p.get("prompt") or "" for p in (chatreplay.get("prompts") or [])]
    sim, labels = similarity_matrix(prompt_texts) if prompt_texts else (None, None)
    repeated_pairs: list[tuple[str, str, float]] = []
    if sim is not None and labels is not None and len(labels) > 1:
        for i in range(len(labels)):
            for j in range(i + 1, len(labels)):
                repeated_pairs.append((labels[i], labels[j], float(sim[i, j])))
        repeated_pairs.sort(key=lambda x: x[2], reverse=True)
        repeated_pairs = repeated_pairs[:10]

    def img_ref(key: str, title: str) -> str:
        p = charts.get(key)
        if not p:
            return ""
        rel = p.relative_to(out_dir).as_posix()
        return f"\n\n### {title}\n\n![]({rel})\n"
    report_md = out_dir / "report.md"
    report_html = out_dir / "report.html"
    md: list[str] = []
    md.append("# GitHub Copilot chatreplay: context management analysis")
    md.append("")
    md.append(f"- Source: `{source_path}`")
    md.append(f"- Exported at: `{exported_at}`")
    md.append(f"- Total prompts: `{total_prompts}` | Total log entries: `{total_logs}`")
    md.append(f"- Token estimator for visible prompt text: `{token_estimator.name}`")
    if models:
        md.append(f"- Models observed in request metadata: {', '.join(models)}")

    md.append("\n## Executive summary")
    if summary.get("prompt_tokens"):
        md.append(
            "- Prompt tokens are dominated by hidden/system/tool context (see overhead chart/table)."
        )
        md.append(
            "- Cached token ratio indicates how much context was resent and reused; high ratios often mean repeated large system/tool payloads."
        )
        md.append(
            "- Prompt similarity heatmap highlights repeated/near-duplicate user prompts (instruction bloat/repetition)."
        )
    else:
        md.append("- No request logs with usage metadata were found.")

    md.append("\n## Key metrics")
    md.append("```json")
    md.append(json.dumps(summary, indent=2))
    md.append("```")

    md.append("\n## Context overhead (estimated)")
    md.append(
        "We estimate *visible user prompt tokens* from the exported `prompt` string, then compute:\n"
        "\n$\\text{overhead}_\\text{est} = \\text{prompt_tokens} - \\text{visible_prompt_tokens_est}$\n"
        "\nThis is not exact (Copilot may add more or less than the tool schemas), but it is a strong signal when the overhead is orders of magnitude larger than your prompt text."
    )
    if not top_overhead.empty:
        md.append("\nTop requests by estimated overhead:")
        md.append(top_overhead.to_markdown(index=False))
    md.append(img_ref("overhead_over_time", "Estimated overhead tokens over time"))
    md.append(img_ref("chars_vs_prompt_tokens", "Visible prompt size vs prompt_tokens"))

    md.append("\n## Cache behavior")
    md.append(
        "`cached_tokens` comes from `prompt_tokens_details.cached_tokens` when present. High cached ratios typically mean the same large prefix/context is reused across calls."
    )
    md.append(img_ref("cached_ratio_over_time", "Cached token ratio over time"))

    md.append("\n## Token usage charts")
    md.append(img_ref("prompt_tokens_hist", "Prompt tokens distribution"))
    md.append(img_ref("prompt_tokens_over_time", "Prompt tokens over time"))
    md.append(img_ref("prompt_vs_completion", "Prompt vs completion tokens"))
    md.append(img_ref("top_prompt_tokens", "Top requests by prompt_tokens"))

    md.append("\n## Prompt repetition / instruction bloat signals")
    md.append(
        "Similarity is computed on the exported prompt texts only (TF-IDF cosine). Values close to 1.0 mean near-duplicates."
    )
    if repeated_pairs:
        md.append("\nTop similar prompt pairs:")
        md.append(pd.DataFrame(repeated_pairs, columns=["a", "b", "similarity"]).to_markdown(index=False))
    md.append(img_ref("prompt_similarity", "Prompt text similarity heatmap"))

    md.append("\n## Context pressure (approaching maxPromptTokens)")
    if not top_pressure.empty:
        md.append("\nTop requests by context pressure:")
        md.append(top_pressure.to_markdown(index=False))

    md.append("\n## Practical recommendations (based on the signals above)")
    md.append(
        "- If overhead dominates, shorten *stable* system instructions and keep tool metadata minimal; avoid pasting large tool schemas/specs repeatedly.\n"
        "- Prefer referencing files (paths) over embedding entire documents; when you must embed, summarize and include only diffs.\n"
        "- If similarity is high across prompts, consolidate repeated instruction blocks into a single short checklist, then only vary the delta per turn.\n"
        "- If `context_pressure` is high, you’re at risk of truncation; move long specs out of the prompt and into artifacts the agent can open/read."
    )
    report_md.write_text("\n".join(md) + "\n", encoding="utf-8")
    # Very small HTML wrapper for easier viewing.
    html = """<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Copilot chatreplay analysis</title>
<style>
body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 24px; max-width: 1100px; }
pre { background: #f6f8fa; padding: 12px; overflow-x: auto; }
code { background: #f6f8fa; padding: 2px 4px; }
img { max-width: 100%; height: auto; border: 1px solid #eee; }
table { border-collapse: collapse; }
th, td { border: 1px solid #ddd; padding: 6px 10px; }
</style>
</head>
<body>
<p>Open the Markdown report for the full content:</p>
<p><a href="report.md">report.md</a></p>
</body>
</html>
"""
    report_html.write_text(html, encoding="utf-8")
    return report_md, report_html
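
# End-to-end pipeline: load the export, flatten the request logs, compute text
# features and derived metrics, render charts, dump CSV tables, and write the
# report files.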
def build_artifacts(
    *,
    input_path: Path,
    out_dir: Path,
    token_estimator: TokenEstimator,
) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, Path], Path, Path]:
    chatreplay = load_chatreplay(input_path)
    rows = list(iter_request_logs(chatreplay))
    df = pd.DataFrame(rows)
    if not df.empty:
        df = compute_prompt_text_features(df, token_estimator)
        df = compute_derived_metrics(df)
    prompt_df = prompt_level_rollups(chatreplay)
    prompts = [p.get("prompt") or "" for p in (chatreplay.get("prompts") or [])]
    charts = plot_all(df, prompts, out_dir)

    # Write raw tables for further slicing.
    tables_dir = out_dir / "tables"
    tables_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(tables_dir / "requests.csv", index=False)
    prompt_df.to_csv(tables_dir / "prompts.csv", index=False)

    report_md, report_html = write_report(
        out_dir=out_dir,
        source_path=input_path,
        token_estimator=token_estimator,
        chatreplay=chatreplay,
        df=df,
        prompt_df=prompt_df,
        charts=charts,
    )
    return df, prompt_df, charts, report_md, report_html
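
# Prefer tiktoken's cl100k_base encoding when the package is importable, otherwise
# fall back to the chars/4 heuristic. Note that cl100k_base is only an approximation
# for models that use a different tokenizer.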
def choose_token_estimator(prefer_tiktoken: bool = True) -> TokenEstimator:
    if prefer_tiktoken:
        try:
            return TiktokenEstimator("cl100k_base")
        except Exception:
            return HeuristicEstimator()
    return HeuristicEstimator()
def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(
        description="Analyze GitHub Copilot chatreplay JSON for context/token management patterns and generate charts + report."
    )
    parser.add_argument(
        "--input",
        required=True,
        help="Path to *.chatreplay.json",
    )
    parser.add_argument(
        "--out",
        default=None,
        help="Output directory (default: ./agentic_sdlc/.output/<timestamp>/)",
    )
    parser.add_argument(
        "--no-tiktoken",
        action="store_true",
        help="Disable tiktoken and use a heuristic token estimator",
    )
    args = parser.parse_args(argv)

    input_path = Path(args.input)
    if not input_path.exists():
        raise SystemExit(f"Input file not found: {input_path}")

    stamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = Path(args.out) if args.out else Path("agentic_sdlc") / ".output" / stamp
    out_dir.mkdir(parents=True, exist_ok=True)
    token_estimator = choose_token_estimator(prefer_tiktoken=not args.no_tiktoken)

    _, _, _, report_md, report_html = build_artifacts(
        input_path=input_path,
        out_dir=out_dir,
        token_estimator=token_estimator,
    )
    print(str(report_md))
    print(str(report_html))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())