Last active
March 8, 2026 19:36
-
-
Save jshahbazi/66c1515897d9e9355ae921a34c4be479 to your computer and use it in GitHub Desktop.
Extract all VS Code GitHub Copilot Chat conversations and export them to Markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Extract all VS Code GitHub Copilot Chat conversations. | |
| Scans both global and per-workspace storage to find all chat sessions, | |
| then exports them as readable Markdown files in an output directory. | |
| Usage: | |
| python3 extract_copilot_chats.py [--output-dir ./copilot_chats] [--format markdown|json] | |
| Storage locations (macOS): | |
| ~/Library/Application Support/Code/User/globalStorage/state.vscdb | |
| ~/Library/Application Support/Code/User/globalStorage/emptyWindowChatSessions/ | |
| ~/Library/Application Support/Code/User/workspaceStorage/<hash>/state.vscdb | |
| ~/Library/Application Support/Code/User/workspaceStorage/<hash>/chatSessions/ | |
| Format: | |
| ``` | |
| {DATE}__{WORKSPACE}__{TITLE}__{SESSION_ID}.{ext} | |
| ``` | |
| **Segments are separated by double underscores (`__`).** There are exactly 4 segments: | |
| | # | Segment | Format | Example | | |
| |---|---------|--------|---------| | |
| | 1 | `DATE` | `YYYY-MM-DD` | `2026-03-07` | | |
| | 2 | `WORKSPACE` | Sanitized folder name (spaces→`_`, special chars removed) | `angry-allen` | | |
| | 3 | `TITLE` | Sanitized chat title (spaces→`_`, special chars removed, max 80 chars) | `Sorting_files_by_date_using_ll_command` | | |
| | 4 | `SESSION_ID` | First 8 chars of the UUID session ID | `6561947e` | | |
| **Extensions:** `.md` (Markdown) and/or `.json` (structured JSON) | |
| **Parsing regex:** | |
| ```python | |
| import re | |
| pattern = r'^(\\d{4}-\\d{2}-\\d{2})__(.+?)__(.+?)__([a-f0-9]{8})\\.(md|json)$' | |
| match = re.match(pattern, filename) | |
| date, workspace, title, session_id, ext = match.groups() | |
| ``` | |
| **JSON file structure** (easier to work with programmatically): | |
| ```json | |
| { | |
| "sessionId": "6561947e-b913-...", | |
| "title": "Sorting files by date using ll command", | |
| "workspace": "angry-allen", | |
| "created": "2026-03-07 18:08:37", | |
| "turns": [ | |
| { | |
| "timestamp": "2026-03-07 18:36:23", | |
| "model": "copilot/gpt-5.3-codex", | |
| "user": "ll command to sort by date", | |
| "assistant": "Use:\n```bash\nll -t\n```\n..." | |
| } | |
| ] | |
| } | |
| ``` | |
| **Sorting:** Files naturally sort chronologically by date when listed alphabetically. Within the same date, they sort by workspace then title. | |
| **Recommendation for an agent:** Point it at the `.json` files rather than parsing Markdown — the JSON has the same data in a structured format with `turns[].user` and `turns[].assistant` fields ready to summarize. | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import re | |
| import sqlite3 | |
| import sys | |
| from datetime import datetime | |
| from pathlib import Path | |
def get_vscode_base() -> Path:
    """Return the VS Code user data directory for the current platform."""
    platform = sys.platform
    if platform == "win32":
        root = Path(os.environ.get("APPDATA", ""))
    elif platform == "darwin":
        root = Path.home() / "Library" / "Application Support"
    else:
        # Anything else is treated as Linux / XDG-style layout.
        root = Path.home() / ".config"
    return root / "Code" / "User"
def read_db_key(db_path: Path, key: str):
    """Read one key out of a VS Code state.vscdb SQLite database.

    The stored value is JSON; returns the decoded object, or None when
    the database/key is missing or any read/decode step fails.
    """
    if not db_path.exists():
        return None
    try:
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        try:
            row = conn.execute(
                "SELECT value FROM ItemTable WHERE key = ?", (key,)
            ).fetchone()
        finally:
            conn.close()
        if row is not None:
            return json.loads(row[0])
    except Exception:
        # Best-effort read: a corrupt/locked DB just yields "no data".
        pass
    return None
def read_db_keys_like(db_path: Path, pattern: str):
    """Fetch every key matching a SQL LIKE pattern from state.vscdb.

    Each value is JSON-decoded when possible; values that are not valid
    JSON are kept raw.  Returns {} when the database is missing or any
    error occurs while reading.
    """
    if not db_path.exists():
        return {}
    try:
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        try:
            rows = conn.execute(
                "SELECT key, value FROM ItemTable WHERE key LIKE ?", (pattern,)
            ).fetchall()
        finally:
            conn.close()
        decoded = {}
        for key, raw in rows:
            try:
                decoded[key] = json.loads(raw)
            except json.JSONDecodeError:
                # Not JSON — keep the raw stored value.
                decoded[key] = raw
        return decoded
    except Exception:
        return {}
def get_workspace_name(ws_dir: Path) -> str:
    """Get the workspace folder name from workspace.json.

    Reads ``<ws_dir>/workspace.json`` and returns the last path component
    of its "folder" URI.  Percent-escapes (e.g. ``%20`` for spaces) are
    decoded so the name matches what the user sees in VS Code.  Falls back
    to a truncated storage-hash directory name when the file is missing
    or unparseable.
    """
    ws_json = ws_dir / "workspace.json"
    if ws_json.exists():
        try:
            data = json.loads(ws_json.read_text(encoding="utf-8"))
            folder = data.get("folder", "")
            # Extract just the folder name from the URI, e.g.
            # file:///Users/me/my%20proj -> "my proj"
            if folder.startswith("file:///"):
                from urllib.parse import unquote
                return unquote(folder[7:].rstrip("/").split("/")[-1])
            # Non-file URIs (remote workspaces etc.) are returned verbatim.
            return folder
        except Exception:
            pass
    # Fallback: first 12 chars of the opaque workspaceStorage hash.
    return ws_dir.name[:12]
def parse_session_jsonl(jsonl_path: Path) -> dict:
    """Parse a chat session .jsonl file into a structured dict.

    JSONL format:
      Line with kind=0: session metadata (version, creationDate, sessionId, ...)
      Lines with kind=1: property patches (e.g., customTitle)
      Lines with kind=2: array patches (requests with user messages + responses)

    Returns a dict with keys sessionId/title/creationDate/requests, or
    None when the file is missing or cannot be parsed.
    """
    if not jsonl_path.exists():
        return None
    session = {
        "sessionId": jsonl_path.stem,
        "title": None,
        "creationDate": None,
        "requests": [],
    }
    try:
        # VS Code writes these files as UTF-8; be explicit so the platform
        # default encoding can't garble non-ASCII chat content.
        with open(jsonl_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                entry = json.loads(line)
                kind = entry.get("kind")
                if kind == 0:
                    # Session metadata
                    v = entry.get("v", {})
                    session["creationDate"] = v.get("creationDate")
                    session["sessionId"] = v.get("sessionId", session["sessionId"])
                elif kind == 1:
                    # Property patch: only the custom title is interesting here
                    keys = entry.get("k", [])
                    val = entry.get("v")
                    if keys == ["customTitle"] and isinstance(val, str):
                        session["title"] = val
                elif kind == 2:
                    # Array patch (requests)
                    keys = entry.get("k", [])
                    val = entry.get("v")
                    if keys == ["requests"] and isinstance(val, list):
                        session["requests"].extend(val)
    except Exception as e:
        print(f" Warning: Failed to parse {jsonl_path}: {e}", file=sys.stderr)
        return None
    return session
def extract_response_text(response_parts: list) -> str:
    """Pull human-readable text out of a list of response parts.

    Handles plain strings, MarkdownString-style dicts ({"value": str}),
    and structured dicts ({"content": [{"value": ...}, ...]}).  String
    "value" entries that are blank or start with "{" (tool payloads)
    are skipped.  Results are joined with newlines.
    """
    collected = []
    for part in response_parts:
        if isinstance(part, str):
            collected.append(part)
            continue
        if not isinstance(part, dict):
            continue
        value = part.get("value")
        if isinstance(value, str):
            # MarkdownString response; drop empty or tool-only payloads.
            if value.strip() and not value.startswith("{"):
                collected.append(value)
        elif isinstance(part.get("content"), list):
            # Structured text response
            for item in part["content"]:
                if isinstance(item, dict) and "value" in item:
                    collected.append(item["value"])
    return "\n".join(collected)
def extract_tool_calls(response_parts: list) -> list:
    """Collect the names of tool calls found in response parts.

    A tool call is any dict part with kind == "toolCall"; its name comes
    from "toolName", then "name", falling back to "unknown".
    """
    return [
        part.get("toolName", part.get("name", "unknown"))
        for part in response_parts
        if isinstance(part, dict) and part.get("kind") == "toolCall"
    ]
def format_timestamp(ts) -> str:
    """Render a millisecond epoch timestamp as 'YYYY-MM-DD HH:MM:SS'.

    Falsy input yields "unknown date"; input that cannot be converted is
    returned as its plain string form.  Uses the local timezone.
    """
    if not ts:
        return "unknown date"
    try:
        return datetime.fromtimestamp(ts / 1000).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return str(ts)
def session_to_markdown(session: dict, workspace_name: str = "") -> str:
    """Convert a parsed session to a readable Markdown document.

    Renders a title header, metadata (workspace, creation date, session
    id), then one "## Turn N" section per request containing the user
    message and the assistant response (with tool-call names when any
    were recorded).
    """
    lines = []
    title = session.get("title") or "Untitled Chat"
    created = format_timestamp(session.get("creationDate"))
    lines.append(f"# {title}")
    lines.append("")
    if workspace_name:
        lines.append(f"**Workspace:** {workspace_name}")
    lines.append(f"**Created:** {created}")
    lines.append(f"**Session ID:** {session.get('sessionId', 'unknown')}")
    lines.append("")
    lines.append("---")
    lines.append("")
    requests = session.get("requests", [])
    if not requests:
        lines.append("*(Empty conversation)*")
        return "\n".join(lines)
    for i, req in enumerate(requests, 1):
        # User message
        message = req.get("message", {})
        if isinstance(message, dict):
            user_text = message.get("text", "")
        else:
            user_text = str(message)
        timestamp = format_timestamp(req.get("timestamp"))
        model_id = req.get("modelId", "")
        lines.append(f"## Turn {i}")
        lines.append("")
        if model_id:
            lines.append(f"*Model: {model_id} | {timestamp}*")
            lines.append("")
        lines.append("### User")
        lines.append("")
        lines.append(user_text if user_text else "*(empty message)*")
        lines.append("")
        # Assistant response
        response = req.get("response", [])
        if isinstance(response, list):
            response_text = extract_response_text(response)
            tool_calls = extract_tool_calls(response)
        else:
            response_text = str(response) if response else ""
            tool_calls = []
        lines.append("### Assistant")
        lines.append("")
        if tool_calls:
            # Join outside the f-string: re-using the f-string's own quote
            # character inside the replacement field is a SyntaxError on
            # Python < 3.12 (PEP 701 only relaxed this in 3.12).
            joined_tools = ", ".join(tool_calls)
            lines.append(f"*Tool calls: {joined_tools}*")
            lines.append("")
        lines.append(response_text if response_text else "*(no response text)*")
        lines.append("")
        lines.append("---")
        lines.append("")
    return "\n".join(lines)
def session_to_json(session: dict, workspace_name: str = "") -> dict:
    """Convert a parsed session to a clean JSON-safe dict."""
    turns = []
    for req in session.get("requests", []):
        message = req.get("message", {})
        if isinstance(message, dict):
            user_text = message.get("text", "")
        else:
            user_text = str(message)
        response = req.get("response", [])
        if isinstance(response, list):
            assistant_text = extract_response_text(response)
        else:
            assistant_text = str(response)
        turns.append({
            "timestamp": format_timestamp(req.get("timestamp")),
            "model": req.get("modelId", ""),
            "user": user_text,
            "assistant": assistant_text,
        })
    return {
        "sessionId": session.get("sessionId"),
        "title": session.get("title") or "Untitled Chat",
        "workspace": workspace_name,
        "created": format_timestamp(session.get("creationDate")),
        "turns": turns,
    }
def sanitize_filename(name: str, max_len: int = 80) -> str:
    """Make a string safe for use as a filename.

    Strips everything except word chars, whitespace, '-' and '.',
    collapses runs of whitespace into single underscores, and truncates
    to max_len.  Returns "untitled" when nothing survives.
    """
    cleaned = re.sub(r'[^\w\s\-.]', '', name)
    cleaned = re.sub(r'\s+', '_', cleaned.strip())
    if not cleaned:
        return "untitled"
    return cleaned[:max_len]
def discover_all_sessions(base: Path) -> list:
    """Discover all chat sessions across all workspaces.

    Returns a list of (workspace_name, session_dict) pairs gathered from
    the global empty-window session store and every workspace's
    chatSessions directory.
    """
    sessions = []

    # 1. Global empty-window sessions
    global_dir = base / "globalStorage" / "emptyWindowChatSessions"
    if global_dir.is_dir():
        for path in global_dir.iterdir():
            if path.suffix != ".jsonl":
                continue
            parsed = parse_session_jsonl(path)
            if parsed:
                sessions.append(("(no workspace)", parsed))

    # 2. Per-workspace sessions
    ws_root = base / "workspaceStorage"
    if ws_root.is_dir():
        for ws_dir in ws_root.iterdir():
            if not ws_dir.is_dir():
                continue
            workspace_name = get_workspace_name(ws_dir)
            chat_dir = ws_dir / "chatSessions"
            if not chat_dir.is_dir():
                continue
            # Session titles live in the session index inside state.vscdb.
            index = read_db_key(ws_dir / "state.vscdb", "chat.ChatSessionStore.index")
            title_map = {}
            if index and "entries" in index:
                title_map = {
                    sid: meta["title"]
                    for sid, meta in index["entries"].items()
                    if "title" in meta
                }
            for path in chat_dir.iterdir():
                if path.suffix != ".jsonl":
                    continue
                parsed = parse_session_jsonl(path)
                if not parsed:
                    continue
                # Fill in title from the index if the JSONL didn't have one.
                if not parsed.get("title") and parsed["sessionId"] in title_map:
                    parsed["title"] = title_map[parsed["sessionId"]]
                sessions.append((workspace_name, parsed))
    return sessions
def main():
    """CLI entry point: discover sessions, then list or export them."""
    parser = argparse.ArgumentParser(description="Extract VS Code Copilot Chat conversations")
    parser.add_argument("--output-dir", "-o", default="./copilot_chats",
                        help="Output directory (default: ./copilot_chats)")
    parser.add_argument("--format", "-f", choices=["markdown", "json", "both"], default="markdown",
                        help="Output format (default: markdown)")
    parser.add_argument("--list", "-l", action="store_true",
                        help="List sessions without extracting")
    parser.add_argument("--vscode-dir", default=None,
                        help="Override VS Code user data directory")
    args = parser.parse_args()

    base = Path(args.vscode_dir) if args.vscode_dir else get_vscode_base()
    if not base.exists():
        print(f"Error: VS Code data directory not found: {base}", file=sys.stderr)
        sys.exit(1)

    print(f"Scanning: {base}")
    sessions = discover_all_sessions(base)
    # Newest first; sessions without a creation date sort last.
    sessions.sort(key=lambda item: item[1].get("creationDate") or 0, reverse=True)
    print(f"Found {len(sessions)} chat sessions\n")

    if args.list:
        # List-only mode: print one summary line per session and stop.
        for workspace_name, session in sessions:
            title = session.get("title") or "Untitled"
            created = format_timestamp(session.get("creationDate"))
            num_turns = len(session.get("requests", []))
            print(f" [{workspace_name}] {title} ({num_turns} turns, {created})")
        return

    # Export sessions
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for workspace_name, session in sessions:
        num_turns = len(session.get("requests", []))
        if num_turns == 0:
            # Skip conversations with no turns at all.
            continue
        title = session.get("title") or "Untitled"
        sid = session.get("sessionId", "unknown")
        # Filename: {DATE}__{WORKSPACE}__{TITLE}__{SESSION_ID_PREFIX}
        creation_ts = session.get("creationDate")
        if creation_ts:
            date_str = datetime.fromtimestamp(creation_ts / 1000).strftime("%Y-%m-%d")
        else:
            date_str = "unknown-date"
        stem = f"{date_str}__{sanitize_filename(workspace_name)}__{sanitize_filename(title)}__{sid[:8]}"
        if args.format in ("markdown", "both"):
            (out_dir / f"{stem}.md").write_text(session_to_markdown(session, workspace_name))
        if args.format in ("json", "both"):
            payload = session_to_json(session, workspace_name)
            (out_dir / f"{stem}.json").write_text(json.dumps(payload, indent=2, ensure_ascii=False))
        print(f" Exported: {title} ({num_turns} turns) [{workspace_name}]")

    print(f"\nDone! Files written to: {out_dir.resolve()}")
    # Summary stats
    total_turns = sum(len(s.get("requests", [])) for _, s in sessions)
    workspaces = set(ws for ws, _ in sessions)
    print(f" Total sessions: {len(sessions)}")
    print(f" Total turns: {total_turns}")
    print(f" Workspaces: {len(workspaces)}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment