Skip to content

Instantly share code, notes, and snippets.

@Theigrams
Last active February 12, 2025 05:40
Show Gist options
  • Select an option

  • Save Theigrams/beeec85e6cf6f90b38331e32ee32d979 to your computer and use it in GitHub Desktop.

Select an option

Save Theigrams/beeec85e6cf6f90b38331e32ee32d979 to your computer and use it in GitHub Desktop.
将 DeepResearch JSON 转成 Markdown
import json
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlparse, urlunparse
@dataclass
class ContentReference:
    """One citation entry attached to a message.

    The fields mirror the keys read from each item of a message's
    ``content_references`` metadata; values are stored as given, without
    validation.
    """

    alt: Optional[str]
    attribution: str
    start_idx: int
    end_idx: int
    # Marker text that is searched for (and replaced) in the message content.
    matched_text: str
    pub_date: Optional[str]
    snippet: str
    title: str
    type: str
    url: str

    @property
    def clean_url(self) -> str:
        """Return ``url`` with its fragment (everything after ``#``) removed.

        This drops text-fragment identifiers such as ``#:~:text=...`` as well
        as any ordinary fragment; scheme, host, path and query are kept.
        """
        return urlunparse(urlparse(self.url)._replace(fragment=""))
@dataclass
class Message:
    """A single parsed conversation message."""

    id: str
    # Unix timestamp; formatted for display by ``formatted_time``.
    create_time: float
    role: str
    content: str
    references: List["ContentReference"]
    is_hidden: bool = False

    @property
    def formatted_time(self) -> str:
        """Return ``create_time`` as ``YYYY-MM-DD HH:MM:SS`` in local time.

        Returns the string ``"Unknown time"`` when ``create_time`` cannot be
        converted (wrong type, NaN, or a value outside the supported range).
        """
        try:
            return datetime.fromtimestamp(self.create_time).strftime("%Y-%m-%d %H:%M:%S")
        except (TypeError, ValueError, OverflowError, OSError):
            # Only conversion failures are expected here; anything else
            # should surface instead of being silently swallowed.
            return "Unknown time"
def extract_reference_numbers(matched_text: str) -> tuple[int, Optional[int], Optional[int]]:
    """Extract the message index and line range from a citation marker.

    The marker format is ``【msg_idx†Lstart_line-Lend_line】``. The marker may
    appear anywhere inside ``matched_text``; the first occurrence is used.

    Returns:
        ``(msg_idx, start_line, end_line)`` as integers when a marker is
        found, otherwise ``(-1, None, None)``. A non-string argument (for
        example ``None`` coming from a JSON ``null``) is treated as "no
        marker" instead of raising.
    """
    if not isinstance(matched_text, str):
        return -1, None, None

    # re.search (not re.match) so the marker is found at any position.
    match = re.search(r"【(\d+)†L(\d+)-L(\d+)】", matched_text)
    if match is None:
        return -1, None, None

    msg_idx, start_line, end_line = (int(group) for group in match.groups())
    return msg_idx, start_line, end_line
def process_content_references(content: str, references: List["ContentReference"]) -> str:
    """Turn citation markers in ``content`` into Markdown links.

    Every occurrence of a reference's ``matched_text`` that contains a
    ``【idx†Lstart-Lend】`` marker is rewritten as
    ``[matched_text](url)``, using the reference's raw ``url``.

    Args:
        content: Message text that may contain citation markers.
        references: References whose ``matched_text`` / ``url`` drive the
            replacement.

    Returns:
        The content with recognised markers replaced by Markdown links.
        References whose ``matched_text`` is empty or carries no recognisable
        marker are left untouched.
    """
    # Key by matched_text so each distinct marker is rewritten exactly once;
    # if several references share a marker, the last one supplies the URL.
    ref_map = {ref.matched_text: ref for ref in references}

    for matched_text, ref in ref_map.items():
        if not matched_text:
            # Nothing to search for (empty string or a JSON null).
            continue
        msg_idx, _, _ = extract_reference_numbers(matched_text)
        if msg_idx == -1:
            # -1 signals that no citation marker was found in matched_text.
            continue
        content = content.replace(matched_text, f"[{matched_text}]({ref.url})")

    return content
def parse_message(msg_data: Dict) -> Optional["Message"]:
    """Parse one message dict from the JSON export.

    Args:
        msg_data: A single message entry. The keys read are ``author``,
            ``content``, ``metadata``, ``create_time``, ``id`` and
            ``is_visually_hidden_from_conversation``.

    Returns:
        A ``Message``, or ``None`` when the message is a system message or
        is flagged as visually hidden (at the top level or in its metadata).
    """
    # ``or {}`` also covers keys that are present but null in the JSON.
    author = msg_data.get("author") or {}
    metadata = msg_data.get("metadata") or {}

    # Filter out system messages and hidden messages.
    if (
        author.get("role") == "system"
        or msg_data.get("is_visually_hidden_from_conversation", False)
        or metadata.get("is_visually_hidden_from_conversation", False)
    ):
        return None

    # Extract the text content.
    raw_content = msg_data.get("content")
    if isinstance(raw_content, str):
        content = raw_content
    else:
        if isinstance(raw_content, dict):
            parts = raw_content.get("parts") or []
        elif isinstance(raw_content, list):
            parts = raw_content
        else:
            parts = []
        # Only string parts can be joined; other part types are skipped
        # rather than crashing the whole conversion.
        content = "\n".join(part for part in parts if isinstance(part, str))

    # Parse the references; null string fields are normalised to "".
    references = [
        ContentReference(
            alt=ref.get("alt"),
            attribution=ref.get("attribution") or "",
            start_idx=ref.get("start_idx", -1),
            end_idx=ref.get("end_idx", -1),
            matched_text=ref.get("matched_text") or "",
            pub_date=ref.get("pub_date"),
            snippet=ref.get("snippet") or "",
            title=ref.get("title") or "",
            type=ref.get("type") or "",
            url=ref.get("url") or "",
        )
        for ref in metadata.get("content_references") or []
    ]

    # If there are references, convert the markers in the text to Markdown links.
    if references:
        content = process_content_references(content, references)

    # Remove any "## 参考文献" (references) section already present in the
    # content: from that heading up to the next line that starts with "##"
    # (optionally preceded by whitespace) or the end of the text.
    content = re.sub(r"(?ms)^\s*## 参考文献\s*\n.*?(?=^\s*##|\Z)", "", content)

    # Default a missing create_time to 0 so sorting and formatting do not fail.
    create_time = msg_data.get("create_time")
    if create_time is None:
        create_time = 0

    return Message(
        id=msg_data.get("id", ""),
        create_time=create_time,
        role=author.get("role", "unknown"),
        content=content,
        references=references,
    )
def format_references(references: List["ContentReference"]) -> str:
    """Format references as sorted, de-duplicated Markdown lines.

    References are de-duplicated by the index found in their citation
    marker (the first reference seen for an index wins) and listed in
    ascending index order, one per line, as
    ``【idx】[title](clean_url)`` followed by `` (attribution)`` when an
    attribution is present.

    Args:
        references: References to list; those without a recognisable
            marker index are skipped.

    Returns:
        The lines joined with newlines, or an empty string when no
        reference has a recognisable index.
    """
    # De-duplicate by the index in the citation marker.
    ref_by_idx = {}
    for ref in references:
        msg_idx, _, _ = extract_reference_numbers(ref.matched_text)
        if msg_idx != -1:
            ref_by_idx.setdefault(msg_idx, ref)

    formatted = []
    for idx in sorted(ref_by_idx):
        ref = ref_by_idx[idx]
        # An empty title would produce an empty link label; fall back to the URL.
        label = ref.title or ref.clean_url
        entry = f"【{idx}】[{label}]({ref.clean_url})"
        if ref.attribution:
            entry += f" ({ref.attribution})"
        formatted.append(entry)

    return "\n".join(formatted)
def convert_json_to_markdown(json_data: Dict) -> str:
    """Convert the JSON chat data to a Markdown document.

    Args:
        json_data: Parsed JSON object; ``conversation_id`` and ``messages``
            are the keys read here.

    Returns:
        Markdown text made of a title, the conversation ID, one section per
        visible message (ordered by ``create_time``) and, when at least one
        reference can be listed, a trailing references section.
    """
    conversation_id = json_data.get("conversation_id", "Unknown")

    messages = []
    all_references = []

    # Parse every message; parse_message returns None for filtered ones.
    for msg_data in json_data.get("messages") or []:
        message = parse_message(msg_data)
        if message is not None:
            messages.append(message)
            all_references.extend(message.references)

    # Sort by creation time (a missing time is stored as 0 by parse_message).
    messages.sort(key=lambda m: m.create_time)

    md_lines = [
        "# 对话记录",
        f"对话ID: {conversation_id}\n",
    ]

    for msg in messages:
        # capitalize() yields "Assistant" / "User" / "Tool" for the known
        # roles and a capitalised form of any other role.
        md_lines.extend(
            [
                f"## {msg.role.capitalize()}",
                f"*时间:{msg.formatted_time}*\n",
                msg.content,
                "\n---\n",
            ]
        )

    # Drop the trailing separator left after the last message.
    if md_lines and md_lines[-1].strip() == "---":
        md_lines.pop()

    # Append the references section only when there is something to list,
    # so no empty heading is emitted.
    if all_references:
        formatted_refs = format_references(all_references)
        if formatted_refs:
            md_lines.extend(["\n## 参考文献", formatted_refs])

    return "\n".join(md_lines)
def main():
    """Command-line entry point: convert a JSON chat file to a Markdown file.

    Takes two positional arguments, the input JSON path and the output
    Markdown path. Both files are read and written as UTF-8.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Convert JSON chat to Markdown")
    parser.add_argument("input_file", help="Input JSON file path")
    parser.add_argument("output_file", help="Output Markdown file path")
    args = parser.parse_args()

    # Read the JSON file.
    with open(args.input_file, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # Convert to Markdown.
    markdown_content = convert_json_to_markdown(json_data)

    # Write the Markdown file.
    with open(args.output_file, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    print(f"Successfully converted {args.input_file} to {args.output_file}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment