Forked from giladbarnea/export_chatgpt_chat_to_md.py
Created
September 1, 2025 10:43
-
-
Save eplord/65f7aa96307dfbaffd3cf78b69c862fb to your computer and use it in GitHub Desktop.
export_chatgpt_chat_to_md.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env python3.12 | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import random | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from typing import Literal, TypedDict | |
class Node(TypedDict):
    """Describe one entry of the export's ``mapping`` table (a conversation-tree node).

    NOTE(review): the functions visible in this file accept plain ``dict``
    objects, so this type is descriptive only.
    """

    id: str
    # Id of the parent node; the exporter treats a node whose parent is None as the root.
    parent: str | None
    # Ids of the child nodes; the branch traversal follows each of them in order.
    children: list[str]
    # Message payload; nodes without a message are skipped when Markdown is written.
    message: Message | None
class Message(TypedDict):
    """Describe the ``message`` payload attached to a conversation node.

    The ``content`` union lists every content shape declared in this file,
    including the thoughts, code and execution-output payloads that the
    Markdown writer handles.
    """

    id: str
    # Who produced the message; the writer dispatches on ``author["role"]``.
    author: Author
    # Payload, discriminated by its ``content_type`` key.
    content: (
        UserEditableContext
        | TextContent
        | ThoughtsContent
        | CodeContent
        | ExecutionOutputContent
    )
    # Free-form metadata; the writer reads ``reasoning_status`` from it when present.
    metadata: dict | ReasoningMetadata | ExecutionOutputMetadata
    # NOTE(review): only this status is declared — confirm whether exports contain others.
    status: Literal["finished_successfully"]
class Author(TypedDict):
    """Describe who produced a message."""

    # The Markdown writer emits "user" and "assistant" turns, skips "system"
    # messages, and consumes a "tool" message only as the execution output
    # that directly follows an assistant code message.
    role: Literal["user", "assistant", "tool", "system"]
    # NOTE(review): presumably the tool name for "tool" messages — confirm against real exports.
    name: Literal["python"] | None
class UserEditableContext(TypedDict):
    """Describe content whose ``content_type`` is "user_editable_context".

    NOTE(review): presumably the user's custom-instructions payload — confirm.
    The writer visible in this file does not read these two text fields.
    """

    content_type: Literal["user_editable_context"]
    user_profile: str
    user_instructions: str
class TextContent(TypedDict):
    """Describe plain-text content (``content_type`` is "text")."""

    content_type: Literal["text"]
    # Text fragments; the writer emits every part of a user message and the
    # first part of an assistant message.
    parts: list[str]
class ThoughtsContent(TypedDict):
    """Describe reasoning content (``content_type`` is "thoughts")."""

    content_type: Literal["thoughts"]
    # Individual reasoning steps; the writer renders each one as a bullet.
    thoughts: list[Thought]
    # NOTE(review): not read by the code visible here; meaning unconfirmed.
    source_analysis_msg_id: str
class Thought(TypedDict):
    """Describe a single reasoning step, rendered by the writer as ``* **summary**: content``."""

    summary: str
    content: str
class ReasoningMetadata(TypedDict):
    """Describe message metadata that carries the reasoning state."""

    # "is_reasoning" messages are dropped when thoughts are excluded;
    # "reasoning_ended" makes the writer close an open <thoughts> block.
    reasoning_status: Literal["is_reasoning", "reasoning_ended"]
class CodeContent(TypedDict):
    """Describe code content (``content_type`` is "code")."""

    content_type: Literal["code"]
    # The writer does not read this field; it always fences the code as python.
    language: Literal["unknown"]
    # Source code written inside the fenced block.
    text: str
class ExecutionOutputContent(TypedDict):
    """Describe tool output content (``content_type`` is "execution_output")."""

    content_type: Literal["execution_output"]
    # Output text; the writer appends it right after the preceding assistant code block.
    text: str
class ExecutionOutputMetadata(TypedDict):
    """Describe metadata attached to an execution-output message."""

    # NOTE(review): presumably shaped like ExecutionOutputAggregateResult — confirm before narrowing the type.
    aggregate_result: dict
class ExecutionOutputAggregateResult(TypedDict):
    """Describe the summary of one code execution.

    NOTE(review): none of these fields is read by the code visible here; the
    field meanings below are inferred from their names and should be confirmed.
    """

    status: Literal["success"]
    run_id: str
    # Presumably the code that was executed.
    code: str
    # Presumably the value of the final expression, as text.
    final_expression_output: str
| # region ---[ Common Helpers ]--- | |
| def _write_branch_to_file( | |
| branch: list[dict], filename: str, title: str, no_thoughts: bool = False | |
| ): | |
| with open(filename, "w") as f: | |
| f.write(f"# {title}\n" + "=" * len(title) + "\n\n") | |
| current_author = None | |
| in_thoughts = False | |
| skip_next = False | |
| for i, node in enumerate(branch): | |
| if skip_next: | |
| skip_next = False | |
| continue | |
| message = node.get("message") | |
| if not message: | |
| continue | |
| author = message.get("author", {}).get("role") | |
| if author == "system": | |
| continue # Skip hidden system messages | |
| content = message.get("content", {}) | |
| content_type = content.get("content_type") | |
| metadata = message.get("metadata", {}) | |
| reasoning_status = metadata.get("reasoning_status") | |
| if author != current_author: | |
| if author == "user": | |
| f.write("---\n\n# User\n\n") | |
| elif author == "assistant": | |
| f.write("---\n\n# Assistant\n\n") | |
| current_author = author | |
| if author == "user": | |
| for part in content.get("parts", []): | |
| match content_type: | |
| case "text": | |
| assert isinstance(part, str), ( | |
| f"Expected 'part' to be a str because content.content_type is 'text', got {type(part)}" | |
| ) | |
| f.write(f"{part}\n\n") | |
| continue | |
| match part.get("content_type"): | |
| case "real_time_user_audio_video_asset_pointer": | |
| duration = float( | |
| part.get("audio_asset_pointer", {}) | |
| .get("metadata", {}) | |
| .get("end") | |
| ) - float( | |
| part.get("audio_asset_pointer", {}) | |
| .get("metadata", {}) | |
| .get("start") | |
| ) | |
| f.write(f"🎙️ ({duration:.1f}s)\n\n") | |
| case "audio_transcription": | |
| f.write(f"{part.get('text')}\n\n") | |
| case _: | |
| print( | |
| f"⚠️ Unknown user part type: {part.get('content_type')}" | |
| ) | |
| elif author == "assistant": | |
| if no_thoughts and reasoning_status == "is_reasoning": | |
| continue | |
| if ( | |
| reasoning_status == "is_reasoning" | |
| and content_type == "thoughts" | |
| and not in_thoughts | |
| ): | |
| f.write("<thoughts>\n") | |
| in_thoughts = True | |
| if content_type == "thoughts": | |
| for thought in content.get("thoughts", []): | |
| f.write( | |
| f"* **{thought.get('summary')}**: {thought.get('content')}\n" | |
| ) | |
| f.write("\n") | |
| elif content_type == "code": | |
| f.write(f"```python\n{content.get('text', '')}\n```\n") | |
| if i + 1 < len(branch): | |
| next_node = branch[i + 1] | |
| next_message = next_node.get("message", {}) | |
| next_author = next_message.get("author", {}).get("role") | |
| next_content = next_message.get("content", {}) | |
| if ( | |
| next_author == "tool" | |
| and next_content.get("content_type") == "execution_output" | |
| ): | |
| f.write(f"{next_content.get('text', '')}\n\n\n") | |
| skip_next = True | |
| elif content_type == "text": | |
| parts = content.get("parts", []) | |
| if parts: | |
| f.write(f"{parts[0]}\n\n") | |
| elif content_type == "reasoning_recap": | |
| f.write(f"{content.get('content')}\n\n") | |
| elif content_type == "multimodal_text": | |
| for part in content.get("parts", []): | |
| match part.get("content_type"): | |
| case "real_time_user_audio_video_asset_pointer": | |
| duration = float( | |
| part.get("audio_asset_pointer", {}) | |
| .get("metadata", {}) | |
| .get("end") | |
| ) - float( | |
| part.get("audio_asset_pointer", {}) | |
| .get("metadata", {}) | |
| .get("start") | |
| ) | |
| f.write(f"🎙️ ({duration:.1f}s)\n\n") | |
| case "audio_transcription": | |
| f.write(f"{part.get('text')}\n\n") | |
| case "audio_asset_pointer": | |
| pass | |
| case _: | |
| print( | |
| f"⚠️ Unknown multimodal_text part type: {part.get('content_type')}" | |
| ) | |
| else: | |
| non_empty_keys = { | |
| k for k, v in content.items() if v and k != "content_type" | |
| } | |
| if non_empty_keys: | |
| print( | |
| f"⚠️ Unknown content type: {content_type}. Non-empty keys: {non_empty_keys}" | |
| ) | |
| if in_thoughts and ( | |
| reasoning_status == "reasoning_ended" or content_type != "thoughts" | |
| ): | |
| f.write("</thoughts>\n\n") | |
| in_thoughts = False | |
| else: | |
| print(f"⚠️ Unknown author: {author}") | |
| print(f"Wrote {len(branch)} nodes to {filename}") | |
def _traverse_branches(
    node_id: str, data: dict, path: list[dict], all_branches: list[list[dict]]
) -> None:
    """Populate ``all_branches`` in place with every node path from ``node_id`` down to a leaf.

    Args:
        node_id: Id of the node to start from; must be a key of ``data["mapping"]``.
        data: Parsed export; only ``data["mapping"]`` is read.
        path: Prefix of nodes leading to ``node_id``. It is used as scratch
            space and restored to its original contents before returning.
        all_branches: Receives one list per leaf: ``path`` followed by the nodes
            from ``node_id`` to that leaf, in depth-first, child-order sequence.

    Raises:
        KeyError: If a referenced node id is missing from the mapping.
        ValueError: If the ``children`` links contain a cycle.
    """
    mapping = data["mapping"]
    base_len = len(path)
    # An explicit stack replaces recursion so that long conversations cannot
    # exceed Python's recursion limit. Each entry is (node id, depth below start).
    pending = [(node_id, 0)]
    while pending:
        current_id, depth = pending.pop()
        if depth >= len(mapping):
            # A simple path cannot hold more nodes than the mapping has.
            raise ValueError(f"Cycle detected in 'children' links at node '{current_id}'")
        # Drop whatever was left over from the previously explored sibling subtree.
        del path[base_len + depth :]
        node = mapping[current_id]
        path.append(node)
        children = node.get("children", [])
        if not children:
            all_branches.append(list(path))
        else:
            # Push in reverse so the first child is explored first.
            pending.extend((child_id, depth + 1) for child_id in reversed(children))
    del path[base_len:]
def _collect_path_to_root(data: dict, current_node_id: str) -> list[dict]:
    """Return the nodes from the root down to ``current_node_id``.

    The walk follows ``parent`` links upward through ``data["mapping"]`` and
    stops at the first falsy or unknown id, so an unknown starting id yields
    an empty list.

    Raises:
        ValueError: If the parent links form a cycle.
    """
    path: list[dict] = []
    visited: set[str] = set()
    # Climb up the tree to the root node and collect the nodes in the path.
    while current_node_id:
        if current_node_id in visited:
            # Without this guard a cyclic export would loop forever.
            raise ValueError(f"Cycle detected in 'parent' links at node '{current_node_id}'")
        node = data["mapping"].get(current_node_id)
        if not node:
            break
        visited.add(current_node_id)
        path.append(node)
        current_node_id = node.get("parent")
    path.reverse()
    return path
| def _replace_uuids(input_path: Path | str) -> str | FileNotFoundError | ValueError: | |
| WORDS = [ | |
| "Abscond", | |
| "Absurdist", | |
| "Adventure", | |
| "Alacrity", | |
| "Algorithm", | |
| "Allegory", | |
| "Altruism", | |
| "Ambivalent", | |
| "Ameliorate", | |
| "Amethyst", | |
| "Anthropological", | |
| "Archeological", | |
| "Artificial", | |
| "Astronomical", | |
| "Auburn", | |
| "Augmented", | |
| "Baleen", | |
| "Ballad", | |
| "Ballet", | |
| "Baroque", | |
| "Benevolent", | |
| "Bicycle", | |
| "Bilk", | |
| "Biological", | |
| "Breeze", | |
| "Bubble", | |
| "Burlesque", | |
| "Cacophony", | |
| "Cadence", | |
| "Cajole", | |
| "Capricious", | |
| "Carousel", | |
| "Cascade", | |
| "Catalyst", | |
| "Cavalier", | |
| "Chastise", | |
| "Chiaroscuro", | |
| "Chocolate", | |
| "Chrysalis", | |
| "Classicism", | |
| "Cobalt", | |
| "Colonial", | |
| "Comedy", | |
| "Concerto", | |
| "Conundrum", | |
| "Convivial", | |
| "Copious", | |
| "Cosmological", | |
| "Courage", | |
| "Crimson", | |
| "Cubism", | |
| "Curiosity", | |
| "Dadaism", | |
| "Dazzle", | |
| "Deleterious", | |
| "Delineate", | |
| "Dewdrop", | |
| "Digital", | |
| "Discombobulate", | |
| "Dolphin", | |
| "Dragonfly", | |
| "Drama", | |
| "Drift", | |
| "Ebullient", | |
| "Echo", | |
| "Ecological", | |
| "Effusive", | |
| "Egalitarian", | |
| "Egotistical", | |
| "Egregious", | |
| "Elegy", | |
| "Elephant", | |
| "Ember", | |
| "Emerald", | |
| "Enigma", | |
| "Enlightenment", | |
| "Environmental", | |
| "Ephemeral", | |
| "Epic", | |
| "Epicurean", | |
| "Epiphany", | |
| "Euphemism", | |
| "Existential", | |
| "Expressionism", | |
| "Fable", | |
| "Facetious", | |
| "Farce", | |
| "Fathom", | |
| "Firefly", | |
| "Flicker", | |
| "Flourish", | |
| "Flummox", | |
| "Folklore", | |
| "Fossil", | |
| "Frivolous", | |
| "Futurism", | |
| "Garnet", | |
| "Garrulous", | |
| "Geological", | |
| "Giggle", | |
| "Glimmer", | |
| "Glimpse", | |
| "Gossamer", | |
| "Gothic", | |
| "Grandiloquent", | |
| "Gregarious", | |
| "Gusto", | |
| "Hackneyed", | |
| "Haiku", | |
| "Halcyon", | |
| "Hapless", | |
| "Harangue", | |
| "Harmony", | |
| "Hedonistic", | |
| "Horizon", | |
| "Humming", | |
| "Hush", | |
| "Hyperbole", | |
| "Iconoclast", | |
| "Idiom", | |
| "Idiosyncrasy", | |
| "Imbibe", | |
| "Impecunious", | |
| "Impressionism", | |
| "Incandescent", | |
| "Indigo", | |
| "Industrial", | |
| "Ineffable", | |
| "Innovation", | |
| "Insidious", | |
| "Integrity", | |
| "Irony", | |
| "Ivory", | |
| "Jasmine", | |
| "Jigsaw", | |
| "Jocular", | |
| "Jocund", | |
| "Jubilant", | |
| "Jubilation", | |
| "Jubilee", | |
| "Juxtapose", | |
| "Juxtaposition", | |
| "Kaleidoscope", | |
| "Keen", | |
| "Kintsugi", | |
| "Kite", | |
| "Kiwi", | |
| "Knoll", | |
| "Labyrinth", | |
| "Lackadaisical", | |
| "Laconic", | |
| "Lark", | |
| "Lavender", | |
| "Legend", | |
| "Lighthouse", | |
| "Limerick", | |
| "Liminal", | |
| "Lissom", | |
| "Lugubrious", | |
| "Lullaby", | |
| "Luminous", | |
| "Majestic", | |
| "Malevolent", | |
| "Malign", | |
| "Masticate", | |
| "Maximalism", | |
| "Meadow", | |
| "Melancholy", | |
| "Mellifluous", | |
| "Melodrama", | |
| "Metaphor", | |
| "Metaphysical", | |
| "Minimalism", | |
| "Mirage", | |
| "Mitigate", | |
| "Modernism", | |
| "Moonglade", | |
| "Mountain", | |
| "Mystery", | |
| "Myth", | |
| "Nary", | |
| "Natural", | |
| "Nebula", | |
| "Nectar", | |
| "Nefarious", | |
| "Nihilism", | |
| "Nimbus", | |
| "Noxious", | |
| "Nymph", | |
| "Obfuscate", | |
| "Obsequious", | |
| "Ode", | |
| "Onerous", | |
| "Onomatopoeia", | |
| "Opal", | |
| "Opaline", | |
| "Opera", | |
| "Orchid", | |
| "Organic", | |
| "Ostentatious", | |
| "Oxymoron", | |
| "Paradigm", | |
| "Paradox", | |
| "Paranormal", | |
| "Parody", | |
| "Parsimonious", | |
| "Pastiche", | |
| "Paucity", | |
| "Pebble", | |
| "Perfunctory", | |
| "Pernicious", | |
| "Petrichor", | |
| "Philosophical", | |
| "Pillow", | |
| "Plethora", | |
| "Ponder", | |
| "Poppy", | |
| "Postmodernism", | |
| "Prism", | |
| "Proverb", | |
| "Psychological", | |
| "Quagmire", | |
| "Quaint", | |
| "Quantize", | |
| "Quantum", | |
| "Quasar", | |
| "Querulous", | |
| "Quibble", | |
| "Quill", | |
| "Quixotic", | |
| "Radiant", | |
| "Rainbow", | |
| "Rancor", | |
| "Recalcitrant", | |
| "Renaissance", | |
| "Repudiate", | |
| "Resilience", | |
| "Rhapsody", | |
| "Ripple", | |
| "Rococo", | |
| "Romanticism", | |
| "Rustic", | |
| "Sagacious", | |
| "Salient", | |
| "Sapphire", | |
| "Sarcasm", | |
| "Sardonic", | |
| "Satire", | |
| "Serendipity", | |
| "Serene", | |
| "Simile", | |
| "Sociological", | |
| "Solipsism", | |
| "Solstice", | |
| "Sonnet", | |
| "Sparkle", | |
| "Starlight", | |
| "Stoic", | |
| "Stymie", | |
| "Sunshine", | |
| "Supernatural", | |
| "Surrealism", | |
| "Sway", | |
| "Sycophant", | |
| "Symphony", | |
| "Synthetic", | |
| "Taciturn", | |
| "Tapestry", | |
| "Tautology", | |
| "Tender", | |
| "Theological", | |
| "Toady", | |
| "Tragedy", | |
| "Tranquil", | |
| "Transcendent", | |
| "Trepidation", | |
| "Twilight", | |
| "Ubiquitous", | |
| "Umbrella", | |
| "Unctuous", | |
| "Utopia", | |
| "Velvet", | |
| "Vexatious", | |
| "Vicarious", | |
| "Vicissitude", | |
| "Victorian", | |
| "Virtual", | |
| "Vivid", | |
| "Vortex", | |
| "Wander", | |
| "Wanderlust", | |
| "Wanton", | |
| "Watermelon", | |
| "Whisker", | |
| "Whisper", | |
| "Willow", | |
| "Wily", | |
| "Xenodochial", | |
| "Xenon", | |
| "Xenophobia", | |
| "Xylophone", | |
| "Yacht", | |
| "Yawn", | |
| "Yearn", | |
| "Yield", | |
| "Zealous", | |
| "Zenith", | |
| "Zephyr", | |
| "Zest", | |
| "Zigzag", | |
| "Zinnia", | |
| ] | |
| try: | |
| input_content = Path(input_path).read_text() | |
| except FileNotFoundError: | |
| return FileNotFoundError(f"Input file not found at '{input_path}'") | |
| uuid_re = re.compile("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}") | |
| all_uuids_in_input = re.findall(uuid_re, input_content) | |
| unique_uuids = list(set(all_uuids_in_input)) | |
| # Ensure there are enough unique words for all the unique UUIDs | |
| if len(WORDS) < len(unique_uuids): | |
| error = ( | |
| f"Error: Not enough unique words in '{WORDS}'. " | |
| f"Need {len(unique_uuids)}, but only {len(WORDS)} words are available." | |
| ) | |
| return ValueError(error) | |
| # Assign a unique random word to each UUID by shuffling the word list | |
| # and creating a mapping dictionary for efficient lookups. | |
| random.shuffle(WORDS) | |
| uuid_to_word_map = dict(zip(unique_uuids, WORDS)) | |
| # For efficiency, build a single regex that matches any of the UUIDs. | |
| # re.escape is used to safely handle any special regex characters in UUIDs. | |
| pattern = re.compile("|".join(re.escape(uuid) for uuid in uuid_to_word_map)) | |
| # Perform the replacement in a single pass. The lambda function looks up | |
| # the matched UUID and returns its corresponding word. | |
| modified_content = pattern.sub( | |
| lambda match: uuid_to_word_map[match.group(0)], input_content | |
| ) | |
| return modified_content | |
| # region ---[ Convert ]--- | |
def convert(
    json_file: Path | str,
    markdown_file: str,
    branches: bool,
    replace_uuids: bool,
    no_thoughts: bool = False,
) -> None:
    """Convert a conversation JSON export to Markdown.

    Args:
        json_file: Path of the exported conversation JSON.
        markdown_file: Output path. With ``branches`` it is used as a stem:
            a trailing ``.md`` is dropped and ``_branch_<n>.md`` is appended.
        branches: When True, write every root-to-leaf branch to its own file;
            otherwise write only the path that ends at ``current_node``.
        replace_uuids: When True, replace UUIDs in the input with random words
            before converting.
        no_thoughts: When True, omit assistant reasoning from the output.

    Raises:
        FileNotFoundError, ValueError: Propagated from the UUID replacement step.
    """
    json_file = Path(json_file)
    if replace_uuids:
        raw = _replace_uuids(json_file)
        # _replace_uuids reports failures by returning the exception object.
        if isinstance(raw, Exception):
            raise raw
    else:
        raw = json_file.read_text(encoding="utf-8")
    data = json.loads(raw)
    title = data.get("title", "Conversation")

    if not branches:
        # Work on the data parsed above rather than re-reading the file, so
        # that UUID replacement also applies to the single-branch output.
        path = _collect_path_to_root(data, data.get("current_node"))
        _write_branch_to_file(path, markdown_file, title, no_thoughts=no_thoughts)
        return

    # The root is the first node that has no parent.
    root_node_id = next(
        (
            node_id
            for node_id, node in data["mapping"].items()
            if node.get("parent") is None
        ),
        None,
    )
    if not root_node_id:
        print("⚠️ No root node found in the conversation; nothing was written.")
        return

    all_branches: list[list[dict]] = []
    _traverse_branches(root_node_id, data, [], all_branches)
    # Strip only a trailing ".md"; a ".md" elsewhere in the path must stay.
    stem = markdown_file.removesuffix(".md")
    for i, branch in enumerate(all_branches):
        _write_branch_to_file(
            branch,
            f"{stem}_branch_{i + 1}.md",
            title,
            no_thoughts=no_thoughts,
        )
def _convert_conversation_to_markdown(
    json_file: Path | str, markdown_file: str, no_thoughts: bool = False
) -> None:
    """Write the conversation's current branch (root to ``current_node``) as Markdown.

    Args:
        json_file: Path of the exported conversation JSON, read as UTF-8.
        markdown_file: Output Markdown path.
        no_thoughts: When True, omit assistant reasoning from the output.
    """
    # JSON is UTF-8; do not depend on the platform's locale encoding.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # current_node is the bottom-most node in the conversation.
    current_node_id = data.get("current_node")
    # Climb up the tree to the root node and collect the nodes in the path.
    path = _collect_path_to_root(data, current_node_id)
    _write_branch_to_file(
        path, markdown_file, data.get("title", "Conversation"), no_thoughts=no_thoughts
    )
| # region ---[ Pick ]--- | |
def pick_branch(
    json_file: Path | str,
    output_file: str,
    message_id_in_branch: str | None = None,
    replace_uuids: bool = False,
    no_thoughts: bool = False,
) -> None:
    """Write a single conversation branch to ``output_file`` as Markdown.

    Args:
        json_file: Path of the exported conversation JSON.
        output_file: Output Markdown path.
        message_id_in_branch: Id of a node on the wanted branch. The branch runs
            from the root through this node and on to the first leaf below it.
            When None, the branch ending at ``current_node`` is written.
        replace_uuids: When True, replace UUIDs in the input with random words.
        no_thoughts: When True, omit assistant reasoning from the output.

    Raises:
        KeyError: If ``message_id_in_branch`` is not a node id of the conversation.
        FileNotFoundError, ValueError: Propagated from the UUID replacement step.

    NOTE(review): replacement happens before the lookup, so if node ids are
    lowercase UUIDs a UUID passed as ``message_id_in_branch`` will not be found
    when ``replace_uuids`` is set — confirm the intended behaviour.
    """
    json_file = Path(json_file)
    if replace_uuids:
        raw = _replace_uuids(json_file)
        # _replace_uuids reports failures by returning the exception object.
        if isinstance(raw, Exception):
            raise raw
    else:
        raw = json_file.read_text(encoding="utf-8")
    data = json.loads(raw)
    title = data.get("title", "Conversation")

    if message_id_in_branch is None:
        # Pluck main branch from current_node
        current_node_id = data.get("current_node")
        path = _collect_path_to_root(data, current_node_id)
        # No need to collect children, because current_node is the bottom-most node.
        _write_branch_to_file(path, output_file, title, no_thoughts=no_thoughts)
        return

    if message_id_in_branch not in data["mapping"]:
        raise KeyError(
            f"Message id '{message_id_in_branch}' not found in the conversation"
        )

    # Join the path above the message with the first leaf path below it.
    up_to_root = _collect_path_to_root(data, message_id_in_branch)
    leaf_paths: list[list[dict]] = []
    _traverse_branches(message_id_in_branch, data, [], leaf_paths)
    down_to_bottom = leaf_paths[0]
    # Internal invariant: both halves meet at the requested node, which is
    # therefore dropped from the lower half to avoid writing it twice.
    assert down_to_bottom[0].get("id") == up_to_root[-1].get("id")
    entire_branch = [*up_to_root, *down_to_bottom[1:]]
    _write_branch_to_file(entire_branch, output_file, title, no_thoughts=no_thoughts)
| # region ---[ CLI ]--- | |
def main_cli():
    """Parse the command line and run the ``convert`` or ``pick`` command.

    ``convert`` is the default: it is assumed whenever the first argument is
    neither a command name nor a top-level help flag. Only the first argument
    is inspected, so file names that happen to equal a command name still work,
    and ``-h``/``--help`` alone shows the top-level help listing both commands.
    ``sys.argv`` is read but not modified.
    """
    CONVERT_COMMAND = "convert"
    PICK_COMMAND = "pick"

    argv = sys.argv[1:]
    first = argv[0] if argv else None
    if first not in (CONVERT_COMMAND, PICK_COMMAND, "-h", "--help"):
        argv.insert(0, CONVERT_COMMAND)

    parser = argparse.ArgumentParser(
        description="Convert conversation JSON to Markdown or pluck node."
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Default (convert) subparser
    convert_parser = subparsers.add_parser(
        CONVERT_COMMAND, help="Convert JSON to Markdown (default)"
    )
    convert_parser.add_argument("json_file", help="The input JSON file.")
    convert_parser.add_argument("markdown_file", help="The output Markdown file.")
    convert_parser.add_argument(
        "-b",
        "--branches",
        action="store_true",
        help="Export all conversation branches to individual files.",
    )

    # Pluck subparser
    pluck_parser = subparsers.add_parser(
        PICK_COMMAND, help="Pluck a branch to Markdown file."
    )
    pluck_parser.add_argument("json_file", help="The input JSON file.")
    pluck_parser.add_argument("output_file", help="The output Markdown file.")
    pluck_parser.add_argument(
        "-m",
        "--message",
        type=str,
        default=None,
        help="Optional message ID which the target branch contains.",
    )

    # Options shared by every command.
    for subparser in subparsers.choices.values():
        subparser.add_argument(
            "--replace-uuids",
            action="store_true",
            help="Replace UUIDs in the output with random words.",
        )
        subparser.add_argument(
            "--no-thoughts",
            action="store_true",
            help="Exclude thoughts in the output.",
        )

    args = parser.parse_args(argv)
    replace_uuids: bool = args.replace_uuids
    no_thoughts: bool = args.no_thoughts

    if args.command == CONVERT_COMMAND:
        convert(
            args.json_file,
            args.markdown_file,
            args.branches,
            replace_uuids,
            no_thoughts=no_thoughts,
        )
    elif args.command == PICK_COMMAND:
        pick_branch(
            args.json_file,
            args.output_file,
            args.message,
            replace_uuids=replace_uuids,
            no_thoughts=no_thoughts,
        )
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main_cli()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment