Skip to content

Instantly share code, notes, and snippets.

@jshahbazi
Last active March 8, 2026 19:36
Show Gist options
  • Select an option

  • Save jshahbazi/66c1515897d9e9355ae921a34c4be479 to your computer and use it in GitHub Desktop.

Select an option

Save jshahbazi/66c1515897d9e9355ae921a34c4be479 to your computer and use it in GitHub Desktop.
Extract all VS Code GitHub Copilot Chat conversations and export them to Markdown
#!/usr/bin/env python3
"""
Extract all VS Code GitHub Copilot Chat conversations.
Scans both global and per-workspace storage to find all chat sessions,
then exports them as readable Markdown files in an output directory.
Usage:
python3 extract_copilot_chats.py [--output-dir ./copilot_chats] [--format markdown|json]
Storage locations (macOS):
~/Library/Application Support/Code/User/globalStorage/state.vscdb
~/Library/Application Support/Code/User/globalStorage/emptyWindowChatSessions/
~/Library/Application Support/Code/User/workspaceStorage/<hash>/state.vscdb
~/Library/Application Support/Code/User/workspaceStorage/<hash>/chatSessions/
Format:
```
{DATE}__{WORKSPACE}__{TITLE}__{SESSION_ID}.{ext}
```
**Segments are separated by double underscores (`__`).** There are exactly 4 segments:
| # | Segment | Format | Example |
|---|---------|--------|---------|
| 1 | `DATE` | `YYYY-MM-DD` | `2026-03-07` |
| 2 | `WORKSPACE` | Sanitized folder name (spaces→`_`, special chars removed) | `angry-allen` |
| 3 | `TITLE` | Sanitized chat title (spaces→`_`, special chars removed, max 80 chars) | `Sorting_files_by_date_using_ll_command` |
| 4 | `SESSION_ID` | First 8 chars of the UUID session ID | `6561947e` |
**Extensions:** `.md` (Markdown) and/or `.json` (structured JSON)
**Parsing regex:**
```python
import re
pattern = r'^(\\d{4}-\\d{2}-\\d{2})__(.+?)__(.+?)__([a-f0-9]{8})\\.(md|json)$'
match = re.match(pattern, filename)
date, workspace, title, session_id, ext = match.groups()
```
**JSON file structure** (easier to work with programmatically):
```json
{
"sessionId": "6561947e-b913-...",
"title": "Sorting files by date using ll command",
"workspace": "angry-allen",
"created": "2026-03-07 18:08:37",
"turns": [
{
"timestamp": "2026-03-07 18:36:23",
"model": "copilot/gpt-5.3-codex",
"user": "ll command to sort by date",
"assistant": "Use:\n```bash\nll -t\n```\n..."
}
]
}
```
**Sorting:** Files naturally sort chronologically by date when listed alphabetically. Within the same date, they sort by workspace then title.
**Recommendation for an agent:** Point it at the `.json` files rather than parsing Markdown — the JSON has the same data in a structured format with `turns[].user` and `turns[].assistant` fields ready to summarize.
"""
import argparse
import json
import os
import re
import sqlite3
import sys
from datetime import datetime
from pathlib import Path
def get_vscode_base() -> Path:
    """Locate the VS Code user-data directory for the current platform.

    Returns the platform-specific ``User`` directory that contains
    ``globalStorage`` and ``workspaceStorage``.
    """
    platform = sys.platform
    if platform == "darwin":
        return Path.home() / "Library" / "Application Support" / "Code" / "User"
    if platform == "win32":
        appdata = os.environ.get("APPDATA", "")
        return Path(appdata) / "Code" / "User"
    # Anything else is treated as a Linux/XDG-style layout.
    return Path.home() / ".config" / "Code" / "User"
def read_db_key(db_path: Path, key: str):
    """Read and JSON-decode a single value from a VS Code state.vscdb database.

    Args:
        db_path: Path to a ``state.vscdb`` SQLite file.
        key: Exact key to look up in the ``ItemTable`` table.

    Returns:
        The JSON-decoded value, or ``None`` if the file is missing, the key
        is absent, or the database/value cannot be read (best-effort reader).
    """
    if not db_path.exists():
        return None
    conn = None
    try:
        # Open read-only so we never lock or mutate a live VS Code database.
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        cur = conn.cursor()
        cur.execute("SELECT value FROM ItemTable WHERE key = ?", (key,))
        row = cur.fetchone()
        if row:
            return json.loads(row[0])
    except Exception:
        # Best-effort: a corrupt/locked DB or malformed JSON yields None.
        pass
    finally:
        # Fix: the original leaked the connection when execute/fetch raised.
        if conn is not None:
            conn.close()
    return None
def read_db_keys_like(db_path: Path, pattern: str):
    """Read every ItemTable entry whose key matches a SQL LIKE pattern.

    Args:
        db_path: Path to a ``state.vscdb`` SQLite file.
        pattern: SQL ``LIKE`` pattern (e.g. ``'chat.%'``).

    Returns:
        Mapping of key -> JSON-decoded value; values that fail to decode
        are kept as raw strings. Empty dict on any read failure.
    """
    if not db_path.exists():
        return {}
    conn = None
    try:
        # Read-only URI open: never lock or mutate a live VS Code database.
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        cur = conn.cursor()
        cur.execute("SELECT key, value FROM ItemTable WHERE key LIKE ?", (pattern,))
        rows = cur.fetchall()
    except Exception:
        # Best-effort: an unreadable DB behaves like an empty one.
        return {}
    finally:
        # Fix: the original leaked the connection when execute/fetch raised.
        if conn is not None:
            conn.close()
    result = {}
    for key, val in rows:
        try:
            result[key] = json.loads(val)
        except json.JSONDecodeError:
            result[key] = val  # keep non-JSON values verbatim
    return result
def get_workspace_name(ws_dir: Path) -> str:
    """Resolve a human-readable workspace name for a workspaceStorage dir.

    Reads ``workspace.json`` and extracts the final path segment of the
    ``folder`` URI, percent-decoding it (VS Code stores URIs with
    %-escapes, e.g. ``My%20Project``). Falls back to a truncated storage
    hash directory name when no usable metadata exists.
    """
    from urllib.parse import unquote

    ws_json = ws_dir / "workspace.json"
    if ws_json.exists():
        try:
            data = json.loads(ws_json.read_text())
            folder = data.get("folder", "")
            if folder.startswith("file:///"):
                # Strip the "file://" scheme, take the last path segment,
                # and decode %-escapes such as %20 back into spaces.
                return unquote(folder[7:].rstrip("/").split("/")[-1])
            return folder
        except Exception:
            pass  # unreadable/malformed workspace.json -> fall through
    # No usable metadata: use a prefix of the storage hash as the name.
    return ws_dir.name[:12]
def parse_session_jsonl(jsonl_path: Path) -> dict:
    """Parse a chat session ``.jsonl`` file into a structured dict.

    JSONL record kinds:
        kind=0: session metadata (version, creationDate, sessionId, ...)
        kind=1: property patch (e.g. ``customTitle``)
        kind=2: array patch (requests with user messages + responses)

    Returns:
        A dict with ``sessionId``, ``title``, ``creationDate`` and
        ``requests`` keys, or ``None`` if the file is missing or malformed.
    """
    if not jsonl_path.exists():
        return None
    session = {
        "sessionId": jsonl_path.stem,  # fallback; a kind=0 record may override
        "title": None,
        "creationDate": None,
        "requests": [],
    }
    try:
        # Fix: be explicit about UTF-8 — session logs are UTF-8 JSON, and the
        # platform default encoding (e.g. on Windows) could corrupt the parse.
        with open(jsonl_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                entry = json.loads(line)
                kind = entry.get("kind")
                if kind == 0:
                    # Session metadata
                    v = entry.get("v", {})
                    session["creationDate"] = v.get("creationDate")
                    session["sessionId"] = v.get("sessionId", session["sessionId"])
                elif kind == 1:
                    # Property patch
                    keys = entry.get("k", [])
                    val = entry.get("v")
                    if keys == ["customTitle"] and isinstance(val, str):
                        session["title"] = val
                elif kind == 2:
                    # Array patch (requests)
                    keys = entry.get("k", [])
                    val = entry.get("v")
                    if keys == ["requests"] and isinstance(val, list):
                        session["requests"].extend(val)
    except Exception as e:
        print(f" Warning: Failed to parse {jsonl_path}: {e}", file=sys.stderr)
        return None
    return session
def extract_response_text(response_parts: list) -> str:
    """Collect the human-readable text from a list of response parts."""
    collected = []
    for item in response_parts:
        if isinstance(item, str):
            collected.append(item)
            continue
        if not isinstance(item, dict):
            continue
        value = item.get("value")
        if isinstance(value, str):
            # MarkdownString part; skip blanks and JSON-looking tool payloads.
            if value.strip() and not value.startswith("{"):
                collected.append(value)
        else:
            # Structured text part: a list of sub-pieces under "content".
            content = item.get("content")
            if isinstance(content, list):
                for piece in content:
                    if isinstance(piece, dict) and "value" in piece:
                        collected.append(piece["value"])
    return "\n".join(collected)
def extract_tool_calls(response_parts: list) -> list:
    """Return the names of ``toolCall`` parts, in order of appearance."""
    return [
        part.get("toolName", part.get("name", "unknown"))
        for part in response_parts
        if isinstance(part, dict) and part.get("kind") == "toolCall"
    ]
def format_timestamp(ts) -> str:
    """Render a millisecond epoch timestamp as ``YYYY-MM-DD HH:MM:SS``.

    Falsy input yields ``'unknown date'``; input that cannot be converted
    is returned as its plain string form.
    """
    if not ts:
        return "unknown date"
    try:
        return datetime.fromtimestamp(ts / 1000).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return str(ts)
def session_to_markdown(session: dict, workspace_name: str = "") -> str:
    """Convert a parsed session to a readable Markdown document.

    Args:
        session: Dict with ``title``, ``creationDate``, ``sessionId`` and
            ``requests`` keys (as produced by ``parse_session_jsonl``).
        workspace_name: Optional workspace label for the header.

    Returns:
        The full Markdown text.
    """
    lines = []
    title = session.get("title") or "Untitled Chat"
    created = format_timestamp(session.get("creationDate"))
    lines.append(f"# {title}")
    lines.append("")
    if workspace_name:
        lines.append(f"**Workspace:** {workspace_name}")
    lines.append(f"**Created:** {created}")
    lines.append(f"**Session ID:** {session.get('sessionId', 'unknown')}")
    lines.append("")
    lines.append("---")
    lines.append("")
    requests = session.get("requests", [])
    if not requests:
        lines.append("*(Empty conversation)*")
        return "\n".join(lines)
    for i, req in enumerate(requests, 1):
        # User message may be a dict ({"text": ...}) or a bare value.
        message = req.get("message", {})
        if isinstance(message, dict):
            user_text = message.get("text", "")
        else:
            user_text = str(message)
        timestamp = format_timestamp(req.get("timestamp"))
        model_id = req.get("modelId", "")
        lines.append(f"## Turn {i}")
        lines.append("")
        if model_id:
            lines.append(f"*Model: {model_id} | {timestamp}*")
            lines.append("")
        lines.append("### User")
        lines.append("")
        lines.append(user_text if user_text else "*(empty message)*")
        lines.append("")
        # Assistant response
        response = req.get("response", [])
        if isinstance(response, list):
            response_text = extract_response_text(response)
            tool_calls = extract_tool_calls(response)
        else:
            response_text = str(response) if response else ""
            tool_calls = []
        lines.append("### Assistant")
        lines.append("")
        if tool_calls:
            # Fix: join outside the f-string — reusing double quotes inside a
            # double-quoted f-string is a SyntaxError before Python 3.12
            # (PEP 701).
            tool_list = ", ".join(tool_calls)
            lines.append(f"*Tool calls: {tool_list}*")
            lines.append("")
        lines.append(response_text if response_text else "*(no response text)*")
        lines.append("")
        lines.append("---")
        lines.append("")
    return "\n".join(lines)
def session_to_json(session: dict, workspace_name: str = "") -> dict:
    """Flatten a parsed session into a clean JSON-serializable summary."""
    turns = []
    for req in session.get("requests", []):
        message = req.get("message", {})
        if isinstance(message, dict):
            user_text = message.get("text", "")
        else:
            user_text = str(message)
        response = req.get("response", [])
        if isinstance(response, list):
            assistant_text = extract_response_text(response)
        else:
            assistant_text = str(response)
        turns.append({
            "timestamp": format_timestamp(req.get("timestamp")),
            "model": req.get("modelId", ""),
            "user": user_text,
            "assistant": assistant_text,
        })
    return {
        "sessionId": session.get("sessionId"),
        "title": session.get("title") or "Untitled Chat",
        "workspace": workspace_name,
        "created": format_timestamp(session.get("creationDate")),
        "turns": turns,
    }
def sanitize_filename(name: str, max_len: int = 80) -> str:
    """Make a string safe for use as a filename.

    Drops everything except word characters, whitespace, hyphens, and
    dots, collapses whitespace runs to ``_``, and truncates to *max_len*.
    """
    cleaned = re.sub(r'[^\w\s\-.]', '', name)
    cleaned = re.sub(r'\s+', '_', cleaned.strip())
    if not cleaned:
        return "untitled"
    return cleaned[:max_len]
def discover_all_sessions(base: Path) -> list:
    """Find every chat session under *base*, global and per-workspace.

    Returns:
        A list of ``(workspace_name, session_dict)`` tuples; global
        (empty-window) sessions are labeled ``"(no workspace)"``.
    """
    found = []

    # 1. Sessions from windows opened without any workspace folder.
    global_dir = base / "globalStorage" / "emptyWindowChatSessions"
    if global_dir.is_dir():
        for path in global_dir.iterdir():
            if path.suffix != ".jsonl":
                continue
            parsed = parse_session_jsonl(path)
            if parsed:
                found.append(("(no workspace)", parsed))

    # 2. Sessions stored under each workspace's hash directory.
    ws_root = base / "workspaceStorage"
    if ws_root.is_dir():
        for ws_dir in ws_root.iterdir():
            if not ws_dir.is_dir():
                continue
            chat_dir = ws_dir / "chatSessions"
            if not chat_dir.is_dir():
                continue
            ws_name = get_workspace_name(ws_dir)
            # The session index in state.vscdb maps session IDs to titles.
            index = read_db_key(ws_dir / "state.vscdb", "chat.ChatSessionStore.index")
            titles = {}
            if index and "entries" in index:
                for sid, meta in index["entries"].items():
                    if "title" in meta:
                        titles[sid] = meta["title"]
            for path in chat_dir.iterdir():
                if path.suffix != ".jsonl":
                    continue
                parsed = parse_session_jsonl(path)
                if not parsed:
                    continue
                # Prefer the title embedded in the JSONL; else the index's.
                if not parsed.get("title") and parsed["sessionId"] in titles:
                    parsed["title"] = titles[parsed["sessionId"]]
                found.append((ws_name, parsed))
    return found
def main():
    """CLI entry point: discover chat sessions, then list or export them.

    Exits with status 1 if the VS Code data directory does not exist.
    Writes Markdown and/or JSON files named
    ``{DATE}__{WORKSPACE}__{TITLE}__{SESSION_ID}.{ext}`` to the output dir.
    """
    parser = argparse.ArgumentParser(description="Extract VS Code Copilot Chat conversations")
    parser.add_argument("--output-dir", "-o", default="./copilot_chats",
                        help="Output directory (default: ./copilot_chats)")
    parser.add_argument("--format", "-f", choices=["markdown", "json", "both"], default="markdown",
                        help="Output format (default: markdown)")
    parser.add_argument("--list", "-l", action="store_true",
                        help="List sessions without extracting")
    parser.add_argument("--vscode-dir", default=None,
                        help="Override VS Code user data directory")
    args = parser.parse_args()

    base = Path(args.vscode_dir) if args.vscode_dir else get_vscode_base()
    if not base.exists():
        print(f"Error: VS Code data directory not found: {base}", file=sys.stderr)
        sys.exit(1)

    print(f"Scanning: {base}")
    sessions = discover_all_sessions(base)
    # Sort by creation date (newest first); sessions without one sort last.
    sessions.sort(key=lambda x: x[1].get("creationDate") or 0, reverse=True)
    print(f"Found {len(sessions)} chat sessions\n")

    if args.list:
        for workspace_name, session in sessions:
            title = session.get("title") or "Untitled"
            created = format_timestamp(session.get("creationDate"))
            num_turns = len(session.get("requests", []))
            print(f" [{workspace_name}] {title} ({num_turns} turns, {created})")
        return

    # Export sessions
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for workspace_name, session in sessions:
        title = session.get("title") or "Untitled"
        sid = session.get("sessionId", "unknown")
        num_turns = len(session.get("requests", []))
        if num_turns == 0:
            continue  # skip empty conversations
        safe_title = sanitize_filename(title)
        safe_ws = sanitize_filename(workspace_name)
        # Format creation date as YYYY-MM-DD for the filename. Fix: guard the
        # conversion (as format_timestamp does) so one malformed timestamp
        # cannot abort the entire export.
        creation_ts = session.get("creationDate")
        try:
            if creation_ts:
                date_str = datetime.fromtimestamp(creation_ts / 1000).strftime("%Y-%m-%d")
            else:
                date_str = "unknown-date"
        except Exception:
            date_str = "unknown-date"
        base_name = f"{date_str}__{safe_ws}__{safe_title}__{sid[:8]}"
        if args.format in ("markdown", "both"):
            md_path = out_dir / f"{base_name}.md"
            md_content = session_to_markdown(session, workspace_name)
            md_path.write_text(md_content)
        if args.format in ("json", "both"):
            json_path = out_dir / f"{base_name}.json"
            json_data = session_to_json(session, workspace_name)
            json_path.write_text(json.dumps(json_data, indent=2, ensure_ascii=False))
        print(f" Exported: {title} ({num_turns} turns) [{workspace_name}]")

    print(f"\nDone! Files written to: {out_dir.resolve()}")
    # Summary stats
    total_turns = sum(len(s.get("requests", [])) for _, s in sessions)
    workspaces = set(ws for ws, _ in sessions)
    print(f" Total sessions: {len(sessions)}")
    print(f" Total turns: {total_turns}")
    print(f" Workspaces: {len(workspaces)}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment