Extract AI-generated summaries from Granola meeting notes cache (macOS)
#!/usr/bin/env python3
"""
Extract AI-generated summaries from Granola meeting notes cache.

Granola stores meeting data in a local cache file. The AI-generated summaries
are stored in documentPanels (NOT in documents.notes_*). This script extracts
those summaries and exports them as markdown files with a JSON index.

Usage:
    python extract_granola_summaries.py list              # List all documents with summaries
    python extract_granola_summaries.py extract           # Extract to markdown files
    python extract_granola_summaries.py extract -o ./out  # Specify output directory

Requirements:
    - macOS with Granola installed
    - Python 3.9+

Author: Dan Malone
License: MIT
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any

# Default paths - modify DEFAULT_OUTPUT_DIR as needed for your system
GRANOLA_CACHE_PATH = Path.home() / "Library/Application Support/Granola/cache-v3.json"
DEFAULT_OUTPUT_DIR = Path.home() / "granola-summaries"

def extract_text_from_prosemirror(node: Any) -> str:
    """
    Recursively extract text from a ProseMirror document structure.

    Granola uses ProseMirror for rich text storage. This function converts
    the nested document structure into plain markdown-ish text.
    """
    if not isinstance(node, dict):
        return ""

    text_parts = []
    node_type = node.get("type", "")

    # Emit a markdown prefix for the node itself, then recurse into children.
    if node_type == "text":
        text_parts.append(node.get("text", ""))
    elif node_type == "heading":
        text_parts.append("\n## ")
    elif node_type == "listItem":
        text_parts.append("\n- ")
    elif node_type == "paragraph":
        text_parts.append("\n")

    for child in node.get("content", []):
        text_parts.append(extract_text_from_prosemirror(child))

    return "".join(text_parts)

def load_granola_cache(cache_path: Path) -> dict:
    """
    Load the Granola cache file.

    The cache file has a nested structure where the actual cache data
    is a JSON string inside the top-level 'cache' key.
    """
    if not cache_path.exists():
        print(f"Error: Granola cache not found at {cache_path}", file=sys.stderr)
        print("Make sure Granola is installed and has recorded at least one meeting.", file=sys.stderr)
        sys.exit(1)

    with open(cache_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The actual cache is a JSON string inside the 'cache' key
    cache_str = data.get("cache", "")
    if isinstance(cache_str, str) and cache_str:
        return json.loads(cache_str)
    return data

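
# Assumed on-disk layout of cache-v3.json (double-encoded JSON, inferred
# from the parsing above rather than from any documented Granola format):
#
#   {"cache": "{\"state\": {\"documents\": {...}, \"documentPanels\": {...}}}"}
#
# json.load() yields {"cache": "<json string>"}; json.loads() on that string
# yields the real {"state": {...}} object that extract_summaries() walks.
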
def extract_summaries(cache: dict) -> list[dict]:
    """
    Extract AI-generated summaries from documentPanels.

    Returns a list of dicts with:
        - doc_id: Document ID
        - title: Document title
        - created_at: Document creation date (ISO format)
        - summary: Extracted summary text
    """
    state = cache.get("state", {})
    documents = state.get("documents", {})
    document_panels = state.get("documentPanels", {})

    summaries = []
    for doc_id, panels in document_panels.items():
        # Skip empty or invalid doc_ids
        if not doc_id or not doc_id.strip():
            continue

        # Find the Summary panel
        summary_panel = None
        for panel in panels.values():
            if panel.get("title") == "Summary":
                summary_panel = panel
                break
        if not summary_panel:
            continue

        # Extract text from ProseMirror content
        content = summary_panel.get("content", {})
        summary_text = extract_text_from_prosemirror(content).strip()
        if not summary_text:
            continue

        # Get document metadata; guard against null fields in the cache
        doc = documents.get(doc_id, {})
        title = doc.get("title") or "Untitled"
        created_at = doc.get("created_at") or ""

        summaries.append({
            "doc_id": doc_id,
            "title": title,
            "created_at": created_at,
            "summary": summary_text,
        })

    # Sort by created_at descending (newest first)
    summaries.sort(key=lambda x: x.get("created_at", ""), reverse=True)
    return summaries

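
# Assumed shape of state["documentPanels"] (inferred from the lookups above;
# field names beyond "title" and "content" are not relied on):
#
#   {"<doc_id>": {
#       "<panel_id>": {"title": "Summary", "content": {<ProseMirror doc>}},
#       ...
#   }, ...}
#
# A document may carry several panels; only the one titled "Summary" holds
# the AI-generated summary this script extracts.
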
def format_date(iso_date: str) -> str:
    """Format an ISO date string as a human-readable date."""
    if not iso_date:
        return "Unknown date"
    try:
        # Python 3.9's fromisoformat() does not accept a trailing "Z",
        # so normalize it to an explicit UTC offset first.
        dt = datetime.fromisoformat(iso_date.replace("Z", "+00:00"))
        return dt.strftime("%Y-%m-%d %H:%M")
    except (ValueError, AttributeError):
        return iso_date

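
# Example: format_date("2026-01-20T09:42:00Z") -> "2026-01-20 09:42"
# (rendered in UTC; unparseable strings are returned unchanged).
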
def cmd_list(args: argparse.Namespace) -> None:
    """List all documents with summaries."""
    cache = load_granola_cache(GRANOLA_CACHE_PATH)
    summaries = extract_summaries(cache)
    if not summaries:
        print("No documents with summaries found.")
        return

    print(f"Found {len(summaries)} documents with AI-generated summaries:\n")
    print(f"{'ID':<11} {'DATE':<18} {'TITLE'}")
    print("-" * 80)
    for summary in summaries:
        date_str = format_date(summary["created_at"])
        # The shortened ID is 11 characters (8 chars + "..."), so pad the
        # column to 11 to keep rows aligned with the header.
        doc_id_short = summary["doc_id"][:8] + "..."
        title_truncated = summary["title"][:45] + ("..." if len(summary["title"]) > 45 else "")
        print(f"{doc_id_short:<11} {date_str:<18} {title_truncated}")
    print(f"\nTotal: {len(summaries)} documents")

def cmd_extract(args: argparse.Namespace) -> None:
    """Extract all summaries to markdown files."""
    cache = load_granola_cache(GRANOLA_CACHE_PATH)
    summaries = extract_summaries(cache)
    if not summaries:
        print("No documents with summaries found.")
        return

    output_dir = Path(args.output) if args.output else DEFAULT_OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Extracting {len(summaries)} summaries to {output_dir}\n")

    # Write individual markdown files
    for summary in summaries:
        doc_id = summary["doc_id"]
        title = summary["title"]
        created_at = summary["created_at"]
        summary_text = summary["summary"]
        date_str = format_date(created_at)

        md_content = f"""# {title}

**Date:** {date_str}
**Document ID:** {doc_id}

---

{summary_text}
"""
        output_path = output_dir / f"{doc_id}.md"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(md_content)
        print(f" Wrote: {output_path.name}")

    # Write JSON index
    index_path = output_dir / "index.json"
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2, ensure_ascii=False)
    print(f"\n Wrote index: {index_path}")
    print(f"\nTotal: {len(summaries)} summaries extracted to {output_dir}")

def main() -> None:
    parser = argparse.ArgumentParser(
        description="Extract AI-generated summaries from Granola meeting notes cache",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s list                     List all meetings with summaries
  %(prog)s extract                  Extract to ~/granola-summaries/
  %(prog)s extract -o ./summaries   Extract to custom directory

Note: This script reads from Granola's local cache file. The AI-generated
summaries are in documentPanels, not in documents.notes_* (which contains
user-entered notes only).
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # list command
    list_parser = subparsers.add_parser("list", help="List all documents with summaries")
    list_parser.set_defaults(func=cmd_list)

    # extract command
    extract_parser = subparsers.add_parser(
        "extract", help="Extract all summaries to markdown files"
    )
    extract_parser.add_argument(
        "-o", "--output",
        help=f"Output directory (default: {DEFAULT_OUTPUT_DIR})"
    )
    extract_parser.set_defaults(func=cmd_extract)

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        sys.exit(1)
    args.func(args)


if __name__ == "__main__":
    main()
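
The script can also be reused as a small library from other Python code instead of via the CLI. A minimal sketch, assuming the file is saved as extract_granola_summaries.py somewhere on your import path:

    from extract_granola_summaries import (
        GRANOLA_CACHE_PATH,
        extract_summaries,
        load_granola_cache,
    )

    # Load the cache and print the most recent summary, if any.
    # extract_summaries() returns the list sorted newest-first.
    cache = load_granola_cache(GRANOLA_CACHE_PATH)
    summaries = extract_summaries(cache)
    if summaries:
        latest = summaries[0]
        print(latest["title"])
        print(latest["summary"])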