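"""Analyze a GitHub Copilot chatreplay JSON export for context/token management patterns.

The script flattens the "request" log entries found under each prompt, derives
token/overhead/cache metrics, renders charts, and writes a Markdown + HTML report.

Outputs (under the chosen output directory):
  charts/   PNG charts
  tables/   requests.csv and prompts.csv for further slicing
  report.md / report.html

Third-party dependencies: pandas, numpy, matplotlib, seaborn, scikit-learn,
tabulate (needed by DataFrame.to_markdown), and optionally tiktoken for more
accurate token counts of the visible prompt text.

Example invocation (the script filename here is illustrative):
    python analyze_chatreplay.py --input session.chatreplay.json --out ./analysis
"""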
from __future__ import annotations

import argparse
import datetime as dt
import json
import math
import re
import statistics
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable

import matplotlib

# Select the non-interactive Agg backend (before importing pyplot) so charts can
# be rendered without a display.
matplotlib.use("Agg")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
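
# Token estimators only measure the *visible* prompt text exported in the chatreplay.
# The API-reported `prompt_tokens` in the usage metadata also covers system
# instructions, tool schemas, and other hidden context, which is exactly the gap
# the "overhead" metrics below try to surface.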
@dataclass(frozen=True)
class TokenEstimator:
    name: str

    def count(self, text: str) -> int:
        raise NotImplementedError


class TiktokenEstimator(TokenEstimator):
    def __init__(self, encoding_name: str = "cl100k_base"):
        super().__init__(name=f"tiktoken:{encoding_name}")
        import tiktoken  # type: ignore

        # The base dataclass is frozen, so regular attribute assignment would raise
        # FrozenInstanceError; bypass __setattr__ for the cached encoder object.
        object.__setattr__(self, "_enc", tiktoken.get_encoding(encoding_name))

    def count(self, text: str) -> int:
        if not text:
            return 0
        return len(self._enc.encode(text))


class HeuristicEstimator(TokenEstimator):
    def __init__(self):
        super().__init__(name="heuristic:chars/4")

    def count(self, text: str) -> int:
        if not text:
            return 0
        # Common rough heuristic: ~4 chars per token for English-ish text.
        return max(1, math.ceil(len(text) / 4))
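
# Timestamps in the export look like "2026-01-06T21:38:52.766Z"; the trailing "Z"
# is rewritten to "+00:00" because datetime.fromisoformat() only accepts it
# natively on Python 3.11+.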
def _parse_iso(ts: str | None) -> dt.datetime | None:
    if not ts:
        return None
    # Example: 2026-01-06T21:38:52.766Z
    try:
        if ts.endswith("Z"):
            ts = ts[:-1] + "+00:00"
        return dt.datetime.fromisoformat(ts)
    except Exception:
        return None


def _safe_filename(value: str) -> str:
    value = value.strip()
    value = re.sub(r"\s+", " ", value)
    value = re.sub(r"[^a-zA-Z0-9._ -]+", "_", value)
    value = value.strip(" .-")
    return value or "artifact"


def _json_size_bytes(obj: Any) -> int:
    try:
        return len(json.dumps(obj, ensure_ascii=False).encode("utf-8"))
    except Exception:
        return 0


def load_chatreplay(path: Path) -> dict[str, Any]:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)
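
# Illustrative (trimmed) shape of the export, inferred from the fields read below;
# values are placeholders, not real data:
#
# {
#   "exportedAt": "...", "totalPrompts": 3, "totalLogEntries": 42,
#   "prompts": [
#     {
#       "prompt": "user-visible prompt text",
#       "hasSeen": true, "logCount": 14,
#       "logs": [
#         {
#           "kind": "request", "id": "...", "type": "...", "name": "...",
#           "metadata": {
#             "model": "...", "maxPromptTokens": 128000, "duration": 1234,
#             "tools": ["..."],
#             "usage": {
#               "prompt_tokens": 10000, "completion_tokens": 500, "total_tokens": 10500,
#               "prompt_tokens_details": {"cached_tokens": 8000}
#             }
#           }
#         }
#       ]
#     }
#   ]
# }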
def iter_request_logs(chatreplay: dict[str, Any]) -> Iterable[dict[str, Any]]:
    prompts = chatreplay.get("prompts") or []
    for prompt_idx, prompt in enumerate(prompts):
        logs = prompt.get("logs") or []
        for log_idx, log in enumerate(logs):
            if log.get("kind") != "request":
                continue
            meta = log.get("metadata") or {}
            usage = (meta.get("usage") or {}) if isinstance(meta, dict) else {}
            yield {
                "prompt_index": prompt_idx,
                "log_index": log_idx,
                "prompt_text": prompt.get("prompt") or "",
                "hasSeen": bool(prompt.get("hasSeen")),
                "log_id": log.get("id"),
                "log_type": log.get("type"),
                "log_name": log.get("name"),
                "requestType": meta.get("requestType"),
                "model": meta.get("model"),
                "maxPromptTokens": meta.get("maxPromptTokens"),
                "maxResponseTokens": meta.get("maxResponseTokens"),
                "location": meta.get("location"),
                "startTime": meta.get("startTime"),
                "endTime": meta.get("endTime"),
                "duration_ms": meta.get("duration"),
                "timeToFirstToken_ms": meta.get("timeToFirstToken"),
                "prompt_tokens": usage.get("prompt_tokens"),
                "completion_tokens": usage.get("completion_tokens"),
                "total_tokens": usage.get("total_tokens"),
                "cached_tokens": (usage.get("prompt_tokens_details") or {}).get("cached_tokens"),
                "tools_count": len(meta.get("tools") or []) if isinstance(meta, dict) else 0,
                "tools_json_bytes": _json_size_bytes(meta.get("tools") or []),
                "meta_json_bytes": _json_size_bytes(meta),
            }
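
# Cheap structural features of the visible prompt text: size, list/code-fence density,
# Windows path / file URI mentions, and imperative keywords. `prompt_token_est` is
# the estimator's count of the same text and feeds the overhead calculation below.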
def compute_prompt_text_features(df: pd.DataFrame, token_estimator: TokenEstimator) -> pd.DataFrame:
    text = df["prompt_text"].fillna("").astype(str)
    df = df.copy()
    df["prompt_chars"] = text.str.len()
    df["prompt_lines"] = text.str.count(r"\n") + 1
    df["prompt_words"] = text.str.count(r"\S+")
    df["prompt_code_fences"] = text.str.count(r"```")
    df["prompt_backticks"] = text.str.count(r"`")
    df["prompt_bullets"] = text.str.count(r"(?m)^(\s*[-*]|\s*\d+\.)\s+")
    df["prompt_filepaths"] = text.str.count(r"(?i)[A-Z]:\\[^\n\r\t]+") + text.str.count(
        r"file:///[^\s\)\]]+"
    )
    df["prompt_imperatives"] = text.str.count(
        r"(?im)\b(must|always|never|ensure|required|required:|do not)\b"
    )
    df["prompt_token_est"] = [token_estimator.count(t) for t in text.tolist()]
    return df
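
# Derived per-request metrics:
#   cached_ratio        = cached_tokens / prompt_tokens         (prefix reuse)
#   completion_ratio    = completion_tokens / total_tokens
#   context_pressure    = prompt_tokens / maxPromptTokens       (how close to the window)
#   prompt_overhead_est = prompt_tokens - prompt_token_est      (hidden/system/tool context)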
def compute_derived_metrics(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    def _to_num(series: pd.Series) -> pd.Series:
        return pd.to_numeric(series, errors="coerce")

    for col in [
        "maxPromptTokens",
        "duration_ms",
        "timeToFirstToken_ms",
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "cached_tokens",
        "tools_count",
        "tools_json_bytes",
        "meta_json_bytes",
        "prompt_token_est",
        "prompt_chars",
        "prompt_lines",
        "prompt_bullets",
        "prompt_filepaths",
        "prompt_imperatives",
    ]:
        if col in df.columns:
            df[col] = _to_num(df[col])
    df["start_dt"] = df["startTime"].apply(_parse_iso)
    df["end_dt"] = df["endTime"].apply(_parse_iso)
    df["cached_ratio"] = df["cached_tokens"] / df["prompt_tokens"]
    df["completion_ratio"] = df["completion_tokens"] / df["total_tokens"]
    df["context_pressure"] = df["prompt_tokens"] / df["maxPromptTokens"]
    # Approximate overhead: prompt_tokens minus estimated tokens in the explicit user prompt string.
    # This tends to capture system prompt + tool schemas + hidden context included by Copilot.
    df["prompt_overhead_est"] = df["prompt_tokens"] - df["prompt_token_est"]
    # Tool schema overhead proxy: the JSON size of tool definitions.
    df["tools_bytes_ratio"] = df["tools_json_bytes"] / df["meta_json_bytes"].replace(0, np.nan)
    return df
def prompt_level_rollups(chatreplay: dict[str, Any]) -> pd.DataFrame:
    prompts = chatreplay.get("prompts") or []
    rows: list[dict[str, Any]] = []
    for i, p in enumerate(prompts):
        text = p.get("prompt") or ""
        logs = p.get("logs") or []
        rows.append(
            {
                "prompt_index": i,
                "prompt_chars": len(text),
                "prompt_lines": text.count("\n") + 1,
                "logCount": p.get("logCount"),
                "logs_len": len(logs),
                "hasSeen": bool(p.get("hasSeen")),
            }
        )
    return pd.DataFrame(rows)
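
# Pairwise TF-IDF cosine similarity over the exported prompt texts; values near 1.0
# flag repeated or near-duplicate prompts (a common source of instruction bloat).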
def similarity_matrix(prompt_texts: list[str]) -> tuple[np.ndarray, list[str]]:
    cleaned = [t if t.strip() else "(empty)" for t in prompt_texts]
    # Keep stopwords; prompts are often instruction-heavy, so stopword removal can hide repetition.
    vec = TfidfVectorizer(min_df=1, max_features=5000)
    X = vec.fit_transform(cleaned)
    sim = cosine_similarity(X)
    labels = [f"p{i}" for i in range(len(cleaned))]
    return sim, labels
def save_plot(fig: plt.Figure, out_path: Path) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=180, bbox_inches="tight")
    plt.close(fig)
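
# Renders every chart into <out_dir>/charts/: token histogram, prompt-vs-completion
# scatter, time series for prompt tokens / cached ratio / estimated overhead,
# chars-vs-tokens scatter, top-10 bar chart, and the prompt similarity heatmap.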
def plot_all(df: pd.DataFrame, prompts: list[str], out_dir: Path) -> dict[str, Path]:
    charts_dir = out_dir / "charts"
    charts: dict[str, Path] = {}
    sns.set_theme(style="whitegrid")

    if not df.empty:
        fig, ax = plt.subplots(figsize=(10, 4))
        sns.histplot(df["prompt_tokens"].dropna(), bins=30, ax=ax)
        ax.set_title("Prompt tokens distribution")
        ax.set_xlabel("prompt_tokens")
        charts["prompt_tokens_hist"] = charts_dir / "prompt_tokens_hist.png"
        save_plot(fig, charts["prompt_tokens_hist"])

        fig, ax = plt.subplots(figsize=(7, 5))
        sns.scatterplot(
            data=df,
            x="prompt_tokens",
            y="completion_tokens",
            hue="model",
            ax=ax,
            legend=False,
        )
        ax.set_title("Prompt vs completion tokens")
        charts["prompt_vs_completion"] = charts_dir / "prompt_vs_completion.png"
        save_plot(fig, charts["prompt_vs_completion"])

        df_time = df.dropna(subset=["start_dt"]).sort_values("start_dt")
        if not df_time.empty:
            fig, ax = plt.subplots(figsize=(10, 4))
            ax.plot(df_time["start_dt"], df_time["prompt_tokens"], marker="o", linewidth=1)
            ax.set_title("Prompt tokens over time")
            ax.set_xlabel("time")
            ax.set_ylabel("prompt_tokens")
            charts["prompt_tokens_over_time"] = charts_dir / "prompt_tokens_over_time.png"
            save_plot(fig, charts["prompt_tokens_over_time"])

            fig, ax = plt.subplots(figsize=(10, 4))
            ax.plot(
                df_time["start_dt"],
                df_time["cached_ratio"].fillna(0),
                marker="o",
                linewidth=1,
            )
            ax.set_title("Cached token ratio over time")
            ax.set_xlabel("time")
            ax.set_ylabel("cached_tokens / prompt_tokens")
            ax.set_ylim(0, 1)
            charts["cached_ratio_over_time"] = charts_dir / "cached_ratio_over_time.png"
            save_plot(fig, charts["cached_ratio_over_time"])

            fig, ax = plt.subplots(figsize=(10, 4))
            ax.plot(
                df_time["start_dt"],
                df_time["prompt_overhead_est"],
                marker="o",
                linewidth=1,
            )
            ax.set_title("Estimated overhead tokens (prompt_tokens - user_prompt_est)")
            ax.set_xlabel("time")
            ax.set_ylabel("overhead_tokens_est")
            charts["overhead_over_time"] = charts_dir / "overhead_over_time.png"
            save_plot(fig, charts["overhead_over_time"])

        fig, ax = plt.subplots(figsize=(7, 5))
        sns.scatterplot(data=df, x="prompt_chars", y="prompt_tokens", hue="model", ax=ax, legend=False)
        ax.set_title("User prompt chars vs prompt_tokens")
        charts["chars_vs_prompt_tokens"] = charts_dir / "chars_vs_prompt_tokens.png"
        save_plot(fig, charts["chars_vs_prompt_tokens"])

        fig, ax = plt.subplots(figsize=(10, 4))
        top = (
            df.sort_values("prompt_tokens", ascending=False)
            .head(10)
            .assign(label=lambda d: d["prompt_index"].apply(lambda x: f"p{x}"))
        )
        sns.barplot(data=top, x="label", y="prompt_tokens", ax=ax)
        ax.set_title("Top 10 requests by prompt_tokens")
        charts["top_prompt_tokens"] = charts_dir / "top_prompt_tokens.png"
        save_plot(fig, charts["top_prompt_tokens"])

    if prompts:
        sim, labels = similarity_matrix(prompts)
        fig, ax = plt.subplots(figsize=(7, 6))
        sns.heatmap(sim, vmin=0, vmax=1, cmap="viridis", xticklabels=labels, yticklabels=labels, ax=ax)
        ax.set_title("Prompt text similarity (TF-IDF cosine)")
        charts["prompt_similarity"] = charts_dir / "prompt_similarity.png"
        save_plot(fig, charts["prompt_similarity"])

    return charts
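
# Compact numeric summary (count/sum/mean/p50/p90/p99/max) for the key token and
# latency columns, plus cached_ratio and context_pressure aggregates; the result is
# embedded as JSON in the report.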
def summarize_numbers(df: pd.DataFrame) -> dict[str, Any]:
    if df.empty:
        return {}

    def s(col: str) -> pd.Series:
        return pd.to_numeric(df[col], errors="coerce").dropna()

    summary: dict[str, Any] = {}
    for col in [
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "cached_tokens",
        "prompt_overhead_est",
        "duration_ms",
    ]:
        values = s(col)
        if values.empty:
            continue
        summary[col] = {
            "count": int(values.shape[0]),
            "sum": float(values.sum()),
            "mean": float(values.mean()),
            "p50": float(values.quantile(0.5)),
            "p90": float(values.quantile(0.9)),
            "p99": float(values.quantile(0.99)) if values.shape[0] >= 10 else float(values.max()),
            "max": float(values.max()),
        }
    ratios = s("cached_ratio")
    if not ratios.empty:
        summary["cached_ratio"] = {
            "mean": float(ratios.mean()),
            "p50": float(ratios.quantile(0.5)),
            "max": float(ratios.max()),
        }
    pressure = s("context_pressure")
    if not pressure.empty:
        summary["context_pressure"] = {
            "mean": float(pressure.mean()),
            "p90": float(pressure.quantile(0.9)),
            "max": float(pressure.max()),
        }
    return summary
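
# Assembles report.md (summary, estimated overhead, cache behavior, usage charts,
# repetition signals, context pressure, recommendations) plus a minimal report.html
# that links to it.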
def write_report(
    *,
    out_dir: Path,
    source_path: Path,
    token_estimator: TokenEstimator,
    chatreplay: dict[str, Any],
    df: pd.DataFrame,
    prompt_df: pd.DataFrame,
    charts: dict[str, Path],
) -> tuple[Path, Path]:
    out_dir.mkdir(parents=True, exist_ok=True)
    exported_at = chatreplay.get("exportedAt")
    total_prompts = chatreplay.get("totalPrompts")
    total_logs = chatreplay.get("totalLogEntries")
    models = sorted(set(df["model"].dropna().astype(str).tolist())) if not df.empty else []
    summary = summarize_numbers(df)

    # Identify the worst offenders for context size.
    top_overhead = (
        df.sort_values("prompt_overhead_est", ascending=False).head(8)[
            [
                "prompt_index",
                "log_index",
                "model",
                "prompt_tokens",
                "prompt_token_est",
                "prompt_overhead_est",
                "cached_tokens",
                "cached_ratio",
            ]
        ]
        if not df.empty
        else pd.DataFrame()
    )
    top_pressure = (
        df.sort_values("context_pressure", ascending=False).head(8)[
            ["prompt_index", "log_index", "model", "prompt_tokens", "maxPromptTokens", "context_pressure"]
        ]
        if not df.empty
        else pd.DataFrame()
    )

    # Pairwise similarity highlights repeated prompts.
    prompt_texts = [p.get("prompt") or "" for p in (chatreplay.get("prompts") or [])]
    sim, labels = similarity_matrix(prompt_texts) if prompt_texts else (None, None)
    repeated_pairs: list[tuple[str, str, float]] = []
    if sim is not None and labels is not None and len(labels) > 1:
        for i in range(len(labels)):
            for j in range(i + 1, len(labels)):
                repeated_pairs.append((labels[i], labels[j], float(sim[i, j])))
        repeated_pairs.sort(key=lambda x: x[2], reverse=True)
        repeated_pairs = repeated_pairs[:10]

    def img_ref(key: str, title: str) -> str:
        p = charts.get(key)
        if not p:
            return ""
        rel = p.relative_to(out_dir).as_posix()
        return f"\n\n### {title}\n\n![]({rel})\n"
    report_md = out_dir / "report.md"
    report_html = out_dir / "report.html"
    md: list[str] = []
    md.append("# GitHub Copilot chatreplay: context management analysis")
    md.append("")
    md.append(f"- Source: `{source_path}`")
    md.append(f"- Exported at: `{exported_at}`")
    md.append(f"- Total prompts: `{total_prompts}` | Total log entries: `{total_logs}`")
    md.append(f"- Token estimator for visible prompt text: `{token_estimator.name}`")
    if models:
        md.append(f"- Models observed in request metadata: {', '.join(models)}")

    md.append("\n## Executive summary")
    if summary.get("prompt_tokens"):
        md.append(
            "- Prompt tokens are dominated by hidden/system/tool context (see overhead chart/table)."
        )
        md.append(
            "- Cached token ratio indicates how much context was resent and reused; high ratios often mean repeated large system/tool payloads."
        )
        md.append(
            "- Prompt similarity heatmap highlights repeated/near-duplicate user prompts (instruction bloat/repetition)."
        )
    else:
        md.append("- No request logs with usage metadata were found.")

    md.append("\n## Key metrics")
    md.append("```json")
    md.append(json.dumps(summary, indent=2))
    md.append("```")

    md.append("\n## Context overhead (estimated)")
    md.append(
        "We estimate *visible user prompt tokens* from the exported `prompt` string, then compute:\n"
        "\n$\\text{overhead}_\\text{est} = \\text{prompt_tokens} - \\text{visible_prompt_tokens_est}$\n"
        "\nThis is not exact (Copilot may add more or less than the tool schemas), but it is a strong signal when the overhead is orders of magnitude larger than your prompt text."
    )
    if not top_overhead.empty:
        md.append("\nTop requests by estimated overhead:")
        md.append(top_overhead.to_markdown(index=False))
    md.append(img_ref("overhead_over_time", "Estimated overhead tokens over time"))
    md.append(img_ref("chars_vs_prompt_tokens", "Visible prompt size vs prompt_tokens"))

    md.append("\n## Cache behavior")
    md.append(
        "`cached_tokens` comes from `prompt_tokens_details.cached_tokens` when present. High cached ratios typically mean the same large prefix/context is reused across calls."
    )
    md.append(img_ref("cached_ratio_over_time", "Cached token ratio over time"))

    md.append("\n## Token usage charts")
    md.append(img_ref("prompt_tokens_hist", "Prompt tokens distribution"))
    md.append(img_ref("prompt_tokens_over_time", "Prompt tokens over time"))
    md.append(img_ref("prompt_vs_completion", "Prompt vs completion tokens"))
    md.append(img_ref("top_prompt_tokens", "Top requests by prompt_tokens"))

    md.append("\n## Prompt repetition / instruction bloat signals")
    md.append(
        "Similarity is computed on the exported prompt texts only (TF-IDF cosine). Values close to 1.0 mean near-duplicates."
    )
    if repeated_pairs:
        md.append("\nTop similar prompt pairs:")
        md.append(pd.DataFrame(repeated_pairs, columns=["a", "b", "similarity"]).to_markdown(index=False))
    md.append(img_ref("prompt_similarity", "Prompt text similarity heatmap"))

    md.append("\n## Context pressure (approaching maxPromptTokens)")
    if not top_pressure.empty:
        md.append("\nTop requests by context pressure:")
        md.append(top_pressure.to_markdown(index=False))

    md.append("\n## Practical recommendations (based on the signals above)")
    md.append(
        "- If overhead dominates, shorten *stable* system instructions and keep tool metadata minimal; avoid pasting large tool schemas/specs repeatedly.\n"
        "- Prefer referencing files (paths) over embedding entire documents; when you must embed, summarize and include only diffs.\n"
        "- If similarity is high across prompts, consolidate repeated instruction blocks into a single short checklist, then only vary the delta per turn.\n"
        "- If `context_pressure` is high, you’re at risk of truncation; move long specs out of the prompt and into artifacts the agent can open/read."
    )
    report_md.write_text("\n".join(md) + "\n", encoding="utf-8")
    # Very small HTML wrapper for easier viewing.
    html = """<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Copilot chatreplay analysis</title>
<style>
body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 24px; max-width: 1100px; }
pre { background: #f6f8fa; padding: 12px; overflow-x: auto; }
code { background: #f6f8fa; padding: 2px 4px; }
img { max-width: 100%; height: auto; border: 1px solid #eee; }
table { border-collapse: collapse; }
th, td { border: 1px solid #ddd; padding: 6px 10px; }
</style>
</head>
<body>
<p>Open the Markdown report for the full content:</p>
<p><a href="report.md">report.md</a></p>
</body>
</html>
"""
    report_html.write_text(html, encoding="utf-8")
    return report_md, report_html
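
# End-to-end pipeline: load the export, flatten the request logs, compute text
# features and derived metrics, render charts, dump CSV tables, and write the
# report files.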
def build_artifacts(
    *,
    input_path: Path,
    out_dir: Path,
    token_estimator: TokenEstimator,
) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, Path], Path, Path]:
    chatreplay = load_chatreplay(input_path)
    rows = list(iter_request_logs(chatreplay))
    df = pd.DataFrame(rows)
    if not df.empty:
        df = compute_prompt_text_features(df, token_estimator)
        df = compute_derived_metrics(df)
    prompt_df = prompt_level_rollups(chatreplay)
    prompts = [p.get("prompt") or "" for p in (chatreplay.get("prompts") or [])]
    charts = plot_all(df, prompts, out_dir)

    # Write raw tables for further slicing.
    tables_dir = out_dir / "tables"
    tables_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(tables_dir / "requests.csv", index=False)
    prompt_df.to_csv(tables_dir / "prompts.csv", index=False)

    report_md, report_html = write_report(
        out_dir=out_dir,
        source_path=input_path,
        token_estimator=token_estimator,
        chatreplay=chatreplay,
        df=df,
        prompt_df=prompt_df,
        charts=charts,
    )
    return df, prompt_df, charts, report_md, report_html
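
# Prefer tiktoken's cl100k_base encoding when the package is importable, otherwise
# fall back to the chars/4 heuristic. Note that cl100k_base is only an approximation
# for models that use a different tokenizer.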
def choose_token_estimator(prefer_tiktoken: bool = True) -> TokenEstimator:
    if prefer_tiktoken:
        try:
            return TiktokenEstimator("cl100k_base")
        except Exception:
            return HeuristicEstimator()
    return HeuristicEstimator()
def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(
        description="Analyze GitHub Copilot chatreplay JSON for context/token management patterns and generate charts + report."
    )
    parser.add_argument(
        "--input",
        required=True,
        help="Path to *.chatreplay.json",
    )
    parser.add_argument(
        "--out",
        default=None,
        help="Output directory (default: ./agentic_sdlc/.output/<timestamp>/)",
    )
    parser.add_argument(
        "--no-tiktoken",
        action="store_true",
        help="Disable tiktoken and use a heuristic token estimator",
    )
    args = parser.parse_args(argv)

    input_path = Path(args.input)
    if not input_path.exists():
        raise SystemExit(f"Input file not found: {input_path}")

    stamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = Path(args.out) if args.out else Path("agentic_sdlc") / ".output" / stamp
    out_dir.mkdir(parents=True, exist_ok=True)
    token_estimator = choose_token_estimator(prefer_tiktoken=not args.no_tiktoken)

    _, _, _, report_md, report_html = build_artifacts(
        input_path=input_path,
        out_dir=out_dir,
        token_estimator=token_estimator,
    )
    print(str(report_md))
    print(str(report_html))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())