Skip to content

Instantly share code, notes, and snippets.

@giladbarnea
Last active June 9, 2025 18:44
Show Gist options
  • Select an option

  • Save giladbarnea/5aebdadfaf6ebb7a26046a47bc75d4ca to your computer and use it in GitHub Desktop.

Select an option

Save giladbarnea/5aebdadfaf6ebb7a26046a47bc75d4ca to your computer and use it in GitHub Desktop.
parse-theinternetarchive-ocr-pages.py | takes a text file and a page-info JSON file and prints the pages wrapped in chapter and page tags
# Example: https://archive.org/details/style-lessons-in-clarity-and-grace-12th-edition-pdf-ebook
import json, sys
def _render_chapter(number, name, pages):
    """Wrap the already-rendered pages of one chapter in matching chapter tags.

    The chapter name is formatted with ``!r`` so it shows up as a quoted
    string in both the opening and the closing tag.
    """
    label = f"Chapter {number}: {name!r}"
    body = "\n\n".join(pages)
    return f"<{label}>\n{body}\n</{label}>"


def format_chapters(text: str, page_ranges) -> str:
    """Group the pages of *text* into tagged chapters and return the result.

    Args:
        text: The full text that the page ranges index into.
        page_ranges: An iterable of sequences whose first two items are the
            start and end offsets of a page inside *text*. Any further items
            are ignored.

    Returns:
        The chapters joined by blank lines, or an empty string when there
        are no pages. The first line of every page is taken as its chapter
        name; consecutive pages sharing that first line form one chapter.
        Chapters and pages are numbered from 1.
    """
    chapters = []
    current_name = None
    current_pages = []

    for page_number, page_range in enumerate(page_ranges, start=1):
        lines = text[page_range[0]:page_range[1]].split("\n")
        name = lines[0]

        if name != current_name:
            # The first line changed: close the chapter collected so far
            # (if any) and start a new one.
            if current_name is not None:
                chapters.append(
                    _render_chapter(len(chapters) + 1, current_name, current_pages)
                )
            current_name = name
            current_pages = []

        # The chapter name line is dropped from the page body. No newline is
        # inserted before the closing tag; the page text is emitted as-is.
        body = "\n".join(lines[1:])
        current_pages.append(f"<Page {page_number}>\n{body}</Page {page_number}>")

    # Flush the chapter that was still open when the pages ran out.
    if current_name is not None:
        chapters.append(
            _render_chapter(len(chapters) + 1, current_name, current_pages)
        )

    return "\n\n".join(chapters)


def main(argv=None):
    """Read the text and page-range files named on the command line and print the chapters.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name, text file,
            page-ranges JSON file). Defaults to ``sys.argv``.

    Raises:
        SystemExit: With a usage message when fewer than two file arguments
            are given.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        program = args[0] if args else "parse-theinternetarchive-ocr-pages.py"
        raise SystemExit(f"usage: {program} TEXT_FILE PAGE_RANGES_JSON")

    # Context managers close both files deterministically; the explicit
    # encoding keeps decoding independent of the platform locale.
    # NOTE(review): assumes both inputs are UTF-8 — confirm.
    with open(args[2], encoding="utf-8") as ranges_file:
        page_ranges = json.load(ranges_file)
    with open(args[1], encoding="utf-8") as text_file:
        text = text_file.read()

    print(format_chapters(text, page_ranges))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment