Skip to content

Instantly share code, notes, and snippets.

@jshahbazi
Last active March 8, 2026 19:36
Show Gist options
  • Select an option

  • Save jshahbazi/66c1515897d9e9355ae921a34c4be479 to your computer and use it in GitHub Desktop.

Select an option

Save jshahbazi/66c1515897d9e9355ae921a34c4be479 to your computer and use it in GitHub Desktop.
Extract all VS Code GitHub Copilot Chat conversations and export them to Markdown
#!/usr/bin/env python3
"""
Extract all VS Code GitHub Copilot Chat conversations.
Scans both global and per-workspace storage to find all chat sessions,
then exports them as readable Markdown files in an output directory.
Usage:
python3 extract_copilot_chats.py [--output-dir ./copilot_chats] [--format markdown|json]
Storage locations (macOS):
~/Library/Application Support/Code/User/globalStorage/state.vscdb
~/Library/Application Support/Code/User/globalStorage/emptyWindowChatSessions/
~/Library/Application Support/Code/User/workspaceStorage/<hash>/state.vscdb
~/Library/Application Support/Code/User/workspaceStorage/<hash>/chatSessions/
Format:
```
{DATE}__{WORKSPACE}__{TITLE}__{SESSION_ID}.{ext}
```
**Segments are separated by double underscores (`__`).** There are exactly 4 segments:
| # | Segment | Format | Example |
|---|---------|--------|---------|
| 1 | `DATE` | `YYYY-MM-DD` | `2026-03-07` |
| 2 | `WORKSPACE` | Sanitized folder name (spaces→`_`, special chars removed) | `angry-allen` |
| 3 | `TITLE` | Sanitized chat title (spaces→`_`, special chars removed, max 80 chars) | `Sorting_files_by_date_using_ll_command` |
| 4 | `SESSION_ID` | First 8 chars of the UUID session ID | `6561947e` |
**Extensions:** `.md` (Markdown) and/or `.json` (structured JSON)
**Parsing regex:**
```python
import re
pattern = r'^(\\d{4}-\\d{2}-\\d{2})__(.+?)__(.+?)__([a-f0-9]{8})\\.(md|json)$'
match = re.match(pattern, filename)
date, workspace, title, session_id, ext = match.groups()
```
**JSON file structure** (easier to work with programmatically):
```json
{
"sessionId": "6561947e-b913-...",
"title": "Sorting files by date using ll command",
"workspace": "angry-allen",
"created": "2026-03-07 18:08:37",
"turns": [
{
"timestamp": "2026-03-07 18:36:23",
"model": "copilot/gpt-5.3-codex",
"user": "ll command to sort by date",
"assistant": "Use:\n```bash\nll -t\n```\n..."
}
]
}
```
**Sorting:** Files naturally sort chronologically by date when listed alphabetically. Within the same date, they sort by workspace then title.
**Recommendation for an agent:** Point it at the `.json` files rather than parsing Markdown — the JSON has the same data in a structured format with `turns[].user` and `turns[].assistant` fields ready to summarize.
"""
import argparse
import json
import os
import re
import sqlite3
import sys
from datetime import datetime
from pathlib import Path
def get_vscode_base() -> Path:
    """Locate the VS Code user-data directory for the current platform.

    Returns the platform-specific ``User`` directory that contains
    ``globalStorage`` and ``workspaceStorage``.
    """
    platform = sys.platform
    if platform == "darwin":
        return Path.home() / "Library" / "Application Support" / "Code" / "User"
    if platform == "win32":
        appdata = os.environ.get("APPDATA", "")
        return Path(appdata) / "Code" / "User"
    # Anything else is treated as a Linux/XDG-style layout.
    return Path.home() / ".config" / "Code" / "User"
def read_db_key(db_path: Path, key: str):
    """Read and JSON-decode a single value from a VS Code state.vscdb database.

    Args:
        db_path: Path to a ``state.vscdb`` SQLite file.
        key: Exact key to look up in the ``ItemTable`` table.

    Returns:
        The JSON-decoded value, or ``None`` if the file is missing, the key
        is absent, or the database/value cannot be read (best-effort reader).
    """
    if not db_path.exists():
        return None
    conn = None
    try:
        # Open read-only so we never lock or mutate a live VS Code database.
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        cur = conn.cursor()
        cur.execute("SELECT value FROM ItemTable WHERE key = ?", (key,))
        row = cur.fetchone()
        if row:
            return json.loads(row[0])
    except Exception:
        # Best-effort: a corrupt/locked DB or malformed JSON yields None.
        pass
    finally:
        # Fix: the original leaked the connection when execute/fetch raised.
        if conn is not None:
            conn.close()
    return None
def read_db_keys_like(db_path: Path, pattern: str):
    """Read every ItemTable entry whose key matches a SQL LIKE pattern.

    Args:
        db_path: Path to a ``state.vscdb`` SQLite file.
        pattern: SQL ``LIKE`` pattern (e.g. ``'chat.%'``).

    Returns:
        Mapping of key -> JSON-decoded value; values that fail to decode
        are kept as raw strings. Empty dict on any read failure.
    """
    if not db_path.exists():
        return {}
    conn = None
    try:
        # Read-only URI open: never lock or mutate a live VS Code database.
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        cur = conn.cursor()
        cur.execute("SELECT key, value FROM ItemTable WHERE key LIKE ?", (pattern,))
        rows = cur.fetchall()
    except Exception:
        # Best-effort: an unreadable DB behaves like an empty one.
        return {}
    finally:
        # Fix: the original leaked the connection when execute/fetch raised.
        if conn is not None:
            conn.close()
    result = {}
    for key, val in rows:
        try:
            result[key] = json.loads(val)
        except json.JSONDecodeError:
            result[key] = val  # keep non-JSON values verbatim
    return result
def get_workspace_name(ws_dir: Path) -> str:
    """Resolve a human-readable workspace name for a workspaceStorage dir.

    Reads ``workspace.json`` and extracts the final path segment of the
    ``folder`` URI, percent-decoding it (VS Code stores URIs with
    %-escapes, e.g. ``My%20Project``). Falls back to a truncated storage
    hash directory name when no usable metadata exists.
    """
    from urllib.parse import unquote

    ws_json = ws_dir / "workspace.json"
    if ws_json.exists():
        try:
            data = json.loads(ws_json.read_text())
            folder = data.get("folder", "")
            if folder.startswith("file:///"):
                # Strip the "file://" scheme, take the last path segment,
                # and decode %-escapes such as %20 back into spaces.
                return unquote(folder[7:].rstrip("/").split("/")[-1])
            return folder
        except Exception:
            pass  # unreadable/malformed workspace.json -> fall through
    # No usable metadata: use a prefix of the storage hash as the name.
    return ws_dir.name[:12]
def parse_session_jsonl(jsonl_path: Path) -> dict:
    """Parse a chat session ``.jsonl`` file into a structured dict.

    JSONL record kinds:
        kind=0: session metadata (version, creationDate, sessionId, ...)
        kind=1: property patch (e.g. ``customTitle``)
        kind=2: array patch (requests with user messages + responses)

    Returns:
        A dict with ``sessionId``, ``title``, ``creationDate`` and
        ``requests`` keys, or ``None`` if the file is missing or malformed.
    """
    if not jsonl_path.exists():
        return None
    session = {
        "sessionId": jsonl_path.stem,  # fallback; a kind=0 record may override
        "title": None,
        "creationDate": None,
        "requests": [],
    }
    try:
        # Fix: be explicit about UTF-8 — session logs are UTF-8 JSON, and the
        # platform default encoding (e.g. on Windows) could corrupt the parse.
        with open(jsonl_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                entry = json.loads(line)
                kind = entry.get("kind")
                if kind == 0:
                    # Session metadata
                    v = entry.get("v", {})
                    session["creationDate"] = v.get("creationDate")
                    session["sessionId"] = v.get("sessionId", session["sessionId"])
                elif kind == 1:
                    # Property patch
                    keys = entry.get("k", [])
                    val = entry.get("v")
                    if keys == ["customTitle"] and isinstance(val, str):
                        session["title"] = val
                elif kind == 2:
                    # Array patch (requests)
                    keys = entry.get("k", [])
                    val = entry.get("v")
                    if keys == ["requests"] and isinstance(val, list):
                        session["requests"].extend(val)
    except Exception as e:
        print(f" Warning: Failed to parse {jsonl_path}: {e}", file=sys.stderr)
        return None
    return session
def extract_response_text(response_parts: list) -> str:
    """Collect the human-readable text from a list of response parts."""
    collected = []
    for item in response_parts:
        if isinstance(item, str):
            collected.append(item)
            continue
        if not isinstance(item, dict):
            continue
        value = item.get("value")
        if isinstance(value, str):
            # MarkdownString part; skip blanks and JSON-looking tool payloads.
            if value.strip() and not value.startswith("{"):
                collected.append(value)
        else:
            # Structured text part: a list of sub-pieces under "content".
            content = item.get("content")
            if isinstance(content, list):
                for piece in content:
                    if isinstance(piece, dict) and "value" in piece:
                        collected.append(piece["value"])
    return "\n".join(collected)
def extract_tool_calls(response_parts: list) -> list:
    """Return the names of ``toolCall`` parts, in order of appearance."""
    return [
        part.get("toolName", part.get("name", "unknown"))
        for part in response_parts
        if isinstance(part, dict) and part.get("kind") == "toolCall"
    ]
def format_timestamp(ts) -> str:
    """Render a millisecond epoch timestamp as ``YYYY-MM-DD HH:MM:SS``.

    Falsy input yields ``'unknown date'``; input that cannot be converted
    is returned as its plain string form.
    """
    if not ts:
        return "unknown date"
    try:
        return datetime.fromtimestamp(ts / 1000).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return str(ts)
def session_to_markdown(session: dict, workspace_name: str = "") -> str:
    """Convert a parsed session to a readable Markdown document.

    Args:
        session: Dict with ``title``, ``creationDate``, ``sessionId`` and
            ``requests`` keys (as produced by ``parse_session_jsonl``).
        workspace_name: Optional workspace label for the header.

    Returns:
        The full Markdown text.
    """
    lines = []
    title = session.get("title") or "Untitled Chat"
    created = format_timestamp(session.get("creationDate"))
    lines.append(f"# {title}")
    lines.append("")
    if workspace_name:
        lines.append(f"**Workspace:** {workspace_name}")
    lines.append(f"**Created:** {created}")
    lines.append(f"**Session ID:** {session.get('sessionId', 'unknown')}")
    lines.append("")
    lines.append("---")
    lines.append("")
    requests = session.get("requests", [])
    if not requests:
        lines.append("*(Empty conversation)*")
        return "\n".join(lines)
    for i, req in enumerate(requests, 1):
        # User message may be a dict ({"text": ...}) or a bare value.
        message = req.get("message", {})
        if isinstance(message, dict):
            user_text = message.get("text", "")
        else:
            user_text = str(message)
        timestamp = format_timestamp(req.get("timestamp"))
        model_id = req.get("modelId", "")
        lines.append(f"## Turn {i}")
        lines.append("")
        if model_id:
            lines.append(f"*Model: {model_id} | {timestamp}*")
            lines.append("")
        lines.append("### User")
        lines.append("")
        lines.append(user_text if user_text else "*(empty message)*")
        lines.append("")
        # Assistant response
        response = req.get("response", [])
        if isinstance(response, list):
            response_text = extract_response_text(response)
            tool_calls = extract_tool_calls(response)
        else:
            response_text = str(response) if response else ""
            tool_calls = []
        lines.append("### Assistant")
        lines.append("")
        if tool_calls:
            # Fix: join outside the f-string — reusing double quotes inside a
            # double-quoted f-string is a SyntaxError before Python 3.12
            # (PEP 701).
            tool_list = ", ".join(tool_calls)
            lines.append(f"*Tool calls: {tool_list}*")
            lines.append("")
        lines.append(response_text if response_text else "*(no response text)*")
        lines.append("")
        lines.append("---")
        lines.append("")
    return "\n".join(lines)
def session_to_json(session: dict, workspace_name: str = "") -> dict:
    """Flatten a parsed session into a clean JSON-serializable summary."""
    turns = []
    for req in session.get("requests", []):
        message = req.get("message", {})
        if isinstance(message, dict):
            user_text = message.get("text", "")
        else:
            user_text = str(message)
        response = req.get("response", [])
        if isinstance(response, list):
            assistant_text = extract_response_text(response)
        else:
            assistant_text = str(response)
        turns.append({
            "timestamp": format_timestamp(req.get("timestamp")),
            "model": req.get("modelId", ""),
            "user": user_text,
            "assistant": assistant_text,
        })
    return {
        "sessionId": session.get("sessionId"),
        "title": session.get("title") or "Untitled Chat",
        "workspace": workspace_name,
        "created": format_timestamp(session.get("creationDate")),
        "turns": turns,
    }
def sanitize_filename(name: str, max_len: int = 80) -> str:
    """Make a string safe for use as a filename.

    Drops everything except word characters, whitespace, hyphens, and
    dots, collapses whitespace runs to ``_``, and truncates to *max_len*.
    """
    cleaned = re.sub(r'[^\w\s\-.]', '', name)
    cleaned = re.sub(r'\s+', '_', cleaned.strip())
    if not cleaned:
        return "untitled"
    return cleaned[:max_len]
def discover_all_sessions(base: Path) -> list:
    """Find every chat session under *base*, global and per-workspace.

    Returns:
        A list of ``(workspace_name, session_dict)`` tuples; global
        (empty-window) sessions are labeled ``"(no workspace)"``.
    """
    found = []

    # 1. Sessions from windows opened without any workspace folder.
    global_dir = base / "globalStorage" / "emptyWindowChatSessions"
    if global_dir.is_dir():
        for path in global_dir.iterdir():
            if path.suffix != ".jsonl":
                continue
            parsed = parse_session_jsonl(path)
            if parsed:
                found.append(("(no workspace)", parsed))

    # 2. Sessions stored under each workspace's hash directory.
    ws_root = base / "workspaceStorage"
    if ws_root.is_dir():
        for ws_dir in ws_root.iterdir():
            if not ws_dir.is_dir():
                continue
            chat_dir = ws_dir / "chatSessions"
            if not chat_dir.is_dir():
                continue
            ws_name = get_workspace_name(ws_dir)
            # The session index in state.vscdb maps session IDs to titles.
            index = read_db_key(ws_dir / "state.vscdb", "chat.ChatSessionStore.index")
            titles = {}
            if index and "entries" in index:
                for sid, meta in index["entries"].items():
                    if "title" in meta:
                        titles[sid] = meta["title"]
            for path in chat_dir.iterdir():
                if path.suffix != ".jsonl":
                    continue
                parsed = parse_session_jsonl(path)
                if not parsed:
                    continue
                # Prefer the title embedded in the JSONL; else the index's.
                if not parsed.get("title") and parsed["sessionId"] in titles:
                    parsed["title"] = titles[parsed["sessionId"]]
                found.append((ws_name, parsed))
    return found
def main():
    """CLI entry point: discover chat sessions, then list or export them.

    Exits with status 1 if the VS Code data directory does not exist.
    Writes Markdown and/or JSON files named
    ``{DATE}__{WORKSPACE}__{TITLE}__{SESSION_ID}.{ext}`` to the output dir.
    """
    parser = argparse.ArgumentParser(description="Extract VS Code Copilot Chat conversations")
    parser.add_argument("--output-dir", "-o", default="./copilot_chats",
                        help="Output directory (default: ./copilot_chats)")
    parser.add_argument("--format", "-f", choices=["markdown", "json", "both"], default="markdown",
                        help="Output format (default: markdown)")
    parser.add_argument("--list", "-l", action="store_true",
                        help="List sessions without extracting")
    parser.add_argument("--vscode-dir", default=None,
                        help="Override VS Code user data directory")
    args = parser.parse_args()

    base = Path(args.vscode_dir) if args.vscode_dir else get_vscode_base()
    if not base.exists():
        print(f"Error: VS Code data directory not found: {base}", file=sys.stderr)
        sys.exit(1)

    print(f"Scanning: {base}")
    sessions = discover_all_sessions(base)
    # Sort by creation date (newest first); sessions without one sort last.
    sessions.sort(key=lambda x: x[1].get("creationDate") or 0, reverse=True)
    print(f"Found {len(sessions)} chat sessions\n")

    if args.list:
        for workspace_name, session in sessions:
            title = session.get("title") or "Untitled"
            created = format_timestamp(session.get("creationDate"))
            num_turns = len(session.get("requests", []))
            print(f" [{workspace_name}] {title} ({num_turns} turns, {created})")
        return

    # Export sessions
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for workspace_name, session in sessions:
        title = session.get("title") or "Untitled"
        sid = session.get("sessionId", "unknown")
        num_turns = len(session.get("requests", []))
        if num_turns == 0:
            continue  # skip empty conversations
        safe_title = sanitize_filename(title)
        safe_ws = sanitize_filename(workspace_name)
        # Format creation date as YYYY-MM-DD for the filename. Fix: guard the
        # conversion (as format_timestamp does) so one malformed timestamp
        # cannot abort the entire export.
        creation_ts = session.get("creationDate")
        try:
            if creation_ts:
                date_str = datetime.fromtimestamp(creation_ts / 1000).strftime("%Y-%m-%d")
            else:
                date_str = "unknown-date"
        except Exception:
            date_str = "unknown-date"
        base_name = f"{date_str}__{safe_ws}__{safe_title}__{sid[:8]}"
        if args.format in ("markdown", "both"):
            md_path = out_dir / f"{base_name}.md"
            md_content = session_to_markdown(session, workspace_name)
            md_path.write_text(md_content)
        if args.format in ("json", "both"):
            json_path = out_dir / f"{base_name}.json"
            json_data = session_to_json(session, workspace_name)
            json_path.write_text(json.dumps(json_data, indent=2, ensure_ascii=False))
        print(f" Exported: {title} ({num_turns} turns) [{workspace_name}]")

    print(f"\nDone! Files written to: {out_dir.resolve()}")
    # Summary stats
    total_turns = sum(len(s.get("requests", [])) for _, s in sessions)
    workspaces = set(ws for ws, _ in sessions)
    print(f" Total sessions: {len(sessions)}")
    print(f" Total turns: {total_turns}")
    print(f" Workspaces: {len(workspaces)}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment