Last active
February 12, 2025 05:40
-
-
Save Theigrams/beeec85e6cf6f90b38331e32ee32d979 to your computer and use it in GitHub Desktop.
将 DeepResearch JSON 转成 Markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import re | |
| from dataclasses import dataclass | |
| from datetime import datetime | |
| from typing import Dict, List, Optional | |
| from urllib.parse import urlparse, urlunparse | |
@dataclass
class ContentReference:
    """A single citation attached to a message.

    Instances are plain records; the only derived value is ``clean_url``.
    """

    alt: Optional[str]
    attribution: str
    start_idx: int
    end_idx: int
    matched_text: str  # the citation marker text this reference belongs to
    pub_date: Optional[str]
    snippet: str
    title: str
    type: str
    url: str

    @property
    def clean_url(self) -> str:
        """Return ``url`` with its fragment component (the part after ``#``) removed."""
        without_fragment = urlparse(self.url)._replace(fragment="")
        return urlunparse(without_fragment)
@dataclass
class Message:
    """One conversation message together with its citations.

    ``create_time`` is treated as a Unix timestamp (seconds) when formatted.
    """

    id: str
    create_time: float
    role: str
    content: str
    # Forward reference: keeps this class definable without evaluating the name.
    references: List["ContentReference"]
    is_hidden: bool = False

    @property
    def formatted_time(self) -> str:
        """Return ``create_time`` as a local-time ``YYYY-MM-DD HH:MM:SS`` string.

        Returns:
            The formatted timestamp, or ``"Unknown time"`` when ``create_time``
            cannot be converted (wrong type, NaN/infinity, or a value outside
            the range the platform supports).
        """
        try:
            return datetime.fromtimestamp(self.create_time).strftime("%Y-%m-%d %H:%M:%S")
        except (TypeError, ValueError, OverflowError, OSError):
            # Only conversion/formatting failures are expected here; anything
            # else is a real bug and should surface.
            return "Unknown time"
def extract_reference_numbers(matched_text: str) -> tuple[int, Optional[int], Optional[int]]:
    """Parse the first citation marker found in ``matched_text``.

    A marker has the form ``【msg_idx†Lstart_line-Lend_line】`` and may appear
    anywhere inside the string.

    Returns:
        ``(msg_idx, start_line, end_line)`` as integers, or
        ``(-1, None, None)`` when the string contains no marker.
    """
    # search (not match) so a marker embedded in surrounding text is still found
    found = re.search(r"【(\d+)†L(\d+)-L(\d+)】", matched_text)
    if found is None:
        return -1, None, None
    idx_text, first_text, last_text = found.groups()
    return int(idx_text), int(first_text), int(last_text)
def process_content_references(content: str, references: List[ContentReference]) -> str:
    """Turn citation markers in ``content`` into Markdown links.

    Every occurrence of a reference's ``matched_text`` (which must contain a
    ``【idx†Lstart-Lend】`` marker) becomes ``[matched_text](url)``, using the
    reference's raw ``url``.

    All substitutions happen in one pass over the original text, so link text
    that has just been inserted is never scanned and wrapped a second time.
    When one ``matched_text`` contains another, the longer one is preferred.

    References without a marker, or with an empty ``url``, are skipped: an
    empty target would only produce a dead link. When several references share
    the same ``matched_text``, the last one with a usable URL is used.

    Args:
        content: Message text containing citation markers.
        references: References whose ``matched_text`` / ``url`` drive the rewrite.

    Returns:
        The text with markers replaced by Markdown links; unchanged when no
        reference qualifies.
    """
    link_for = {}
    for ref in references:
        msg_idx, start_line, end_line = extract_reference_numbers(ref.matched_text)
        if msg_idx == -1 or start_line is None or end_line is None:
            continue
        if not ref.url:
            continue
        link_for[ref.matched_text] = f"[{ref.matched_text}]({ref.url})"

    if not link_for:
        return content

    # Longest first: alternation tries branches in order, so a marker string
    # that contains a shorter one must be listed before it.
    ordered = sorted(link_for, key=len, reverse=True)
    marker_pattern = re.compile("|".join(re.escape(text) for text in ordered))
    return marker_pattern.sub(lambda hit: link_for[hit.group(0)], content)
def _extract_content_text(raw_content) -> str:
    """Flatten a message's ``content`` value into one newline-joined string."""
    if isinstance(raw_content, str):
        return raw_content
    if isinstance(raw_content, dict):
        parts = raw_content.get("parts") or []
    elif isinstance(raw_content, list):
        parts = raw_content
    else:
        return ""
    # str.join raises TypeError on non-string items, so keep only text parts.
    return "\n".join(part for part in parts if isinstance(part, str))


def parse_message(msg_data: Dict) -> Optional[Message]:
    """Build a ``Message`` from one raw message dict.

    Args:
        msg_data: Raw message mapping. The keys read are ``author``,
            ``metadata``, ``content``, ``create_time``, ``id`` and
            ``is_visually_hidden_from_conversation``.

    Returns:
        The parsed ``Message``, or ``None`` for system messages and messages
        flagged as visually hidden (at top level or inside ``metadata``).
    """
    # ``or {}`` also covers keys that are present but explicitly null.
    author = msg_data.get("author") or {}
    metadata = msg_data.get("metadata") or {}

    # Skip system messages and hidden messages.
    if (
        author.get("role") == "system"
        or msg_data.get("is_visually_hidden_from_conversation", False)
        or metadata.get("is_visually_hidden_from_conversation", False)
    ):
        return None

    content = _extract_content_text(msg_data.get("content"))

    # Collect the citations attached to this message.
    references = [
        ContentReference(
            alt=ref.get("alt"),
            attribution=ref.get("attribution", ""),
            start_idx=ref.get("start_idx", -1),
            end_idx=ref.get("end_idx", -1),
            matched_text=ref.get("matched_text", ""),
            pub_date=ref.get("pub_date"),
            snippet=ref.get("snippet", ""),
            title=ref.get("title", ""),
            type=ref.get("type", ""),
            url=ref.get("url", ""),
        )
        for ref in metadata.get("content_references") or []
    ]

    # With citations present, rewrite the markers in the text as Markdown links.
    if references:
        content = process_content_references(content, references)

    # Drop any "## 参考文献" (references) section already in the text: from
    # that heading up to the next line beginning with "##" (after optional
    # whitespace) or the end of the text.
    content = re.sub(r"(?ms)^\s*## 参考文献\s*\n.*?(?=^\s*##|\Z)", "", content)

    # A missing timestamp becomes 0 so that sorting and formatting never see None.
    create_time = msg_data.get("create_time")
    if create_time is None:
        create_time = 0

    return Message(
        id=msg_data.get("id", ""),
        create_time=create_time,
        role=author.get("role") or "unknown",
        content=content,
        references=references,
    )
def format_references(references: List[ContentReference]) -> str:
    """Render references as a de-duplicated list ordered by citation number.

    The citation number is the first number inside each reference's marker.
    Only the first reference seen for a given number is kept; references whose
    ``matched_text`` holds no marker are dropped.

    Returns:
        One line per kept reference, ``【n】[title](clean_url)`` followed by
        `` (attribution)`` when the attribution is non-empty, joined with
        newlines.
    """
    first_by_number = {}
    for candidate in references:
        number = extract_reference_numbers(candidate.matched_text)[0]
        if number == -1:
            continue
        # setdefault keeps the earliest reference for each number
        first_by_number.setdefault(number, candidate)

    def render(number):
        chosen = first_by_number[number]
        line = f"【{number}】[{chosen.title}]({chosen.clean_url})"
        return f"{line} ({chosen.attribution})" if chosen.attribution else line

    return "\n".join(render(number) for number in sorted(first_by_number))
def convert_json_to_markdown(json_data: Dict) -> str:
    """Render a parsed chat export as a Markdown document.

    Reads ``conversation_id`` and ``messages`` from ``json_data``. Messages
    that ``parse_message`` rejects are left out; the rest are listed in
    ascending ``create_time`` order, each under a role heading. A references
    section is appended when any kept message carries references.

    Returns:
        The Markdown text.
    """
    parsed = (parse_message(raw) for raw in json_data.get("messages", []))
    kept = [msg for msg in parsed if msg]

    # References are gathered in input order, before the messages are sorted.
    cited = []
    for msg in kept:
        cited.extend(msg.references)

    # Messages without a timestamp carry 0 and therefore sort first.
    chronological = sorted(kept, key=lambda msg: msg.create_time)

    chat_id = json_data.get("conversation_id", "Unknown")
    out = ["# 对话记录", f"对话ID: {chat_id}\n"]

    # Heading text per role; any other role is simply capitalized.
    known_titles = {"assistant": "Assistant", "user": "User", "tool": "Tool"}
    for msg in chronological:
        if msg.role in known_titles:
            heading = known_titles[msg.role]
        else:
            heading = msg.role.capitalize()
        out += [f"## {heading}", f"*时间:{msg.formatted_time}*\n", msg.content, "\n---\n"]

    # Remove the separator left dangling after the last message.
    if out and out[-1].strip() == "---":
        out.pop()

    # Append the references section, if there is anything to list.
    if cited:
        out += ["\n## 参考文献", format_references(cited)]

    return "\n".join(out)
def main():
    """Command-line entry point: convert a JSON chat file to a Markdown file.

    Takes two positional arguments, the input JSON path and the output
    Markdown path, and prints a confirmation line once the output is written.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Convert JSON chat to Markdown")
    cli.add_argument("input_file", help="Input JSON file path")
    cli.add_argument("output_file", help="Output Markdown file path")
    options = cli.parse_args()
    source_path, target_path = options.input_file, options.output_file

    # Load the JSON chat data.
    with open(source_path, "r", encoding="utf-8") as src:
        chat = json.load(src)

    # Render it as Markdown.
    rendered = convert_json_to_markdown(chat)

    # Write the Markdown file.
    with open(target_path, "w", encoding="utf-8") as dst:
        dst.write(rendered)

    print(f"Successfully converted {source_path} to {target_path}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment