Skip to content

Instantly share code, notes, and snippets.

@VRichardJP
Last active October 27, 2025 00:39
Show Gist options
  • Select an option

  • Save VRichardJP/9efb97c42718d9d15016986f88e826ee to your computer and use it in GitHub Desktop.

Select an option

Save VRichardJP/9efb97c42718d9d15016986f88e826ee to your computer and use it in GitHub Desktop.
Convert Critical Role transcripts from https://www.kryogenix.org/crsearch/html/index.html to epub using pandoc
#!/usr/bin/env python3
# A simple script that converts all Critical Role transcripts from <https://www.kryogenix.org/crsearch/html/index.html> to text format. The text transcripts can then easily be converted to ebook formats using pandoc.
#
# How to use:
#
# 1. Download and extract all Critical Role transcripts from <https://www.kryogenix.org/crsearch/cr_full.zip>.
# 2. Run this script from the `cr_full/` directory (or change the value of `INDEX` below)
# 3. For each campaign, a new text file containing all episode transcripts will be saved under the `txt/` directory
# 4. You may then easily convert each text to e.g. an ebook using pandoc. For example: `pandoc txt/c1.txt -o c1.epub`
#
# Happy reading :-)
from html.parser import HTMLParser
from pathlib import Path
from dataclasses import dataclass, field
# Path to the transcript index page. It is a relative path, so it is resolved
# against the directory the script is run from.
INDEX = "html/index.html"
@dataclass(kw_only=True)
class DialogData:
speaker: str = ""
lines: list[str] = field(default_factory=list)
def format(self) -> str:
return f"**{self.speaker}**\n{'\n'.join(self.lines)}\n"
@dataclass(kw_only=True)
class EpisodeData:
title: str = ""
html_path: str = ""
dialogs: list[DialogData] = field(default_factory=list)
def format(self) -> str:
return f"# {self.title}\n\n{'\n'.join([dialog.format() for dialog in self.dialogs])}\n"
@dataclass(kw_only=True)
class CampaignData:
id: str = ""
title: str = ""
episodes: list[EpisodeData] = field(default_factory=list)
def format(self) -> str:
return f"% {self.title}\n\n{'\n'.join([episode.format() for episode in self.episodes])}\n"
class CRIndexParser(HTMLParser):
def __init__(self):
super().__init__()
self.in_main: bool = False
self.in_campaign_title: bool = False
self.in_episode_list: bool = False
self.in_episode_title: bool = False
self.in_episode_link: bool = False
self.campaigns: list[CampaignData] = []
def _handle_tag(
self, tag: str, attrs: list[tuple[str, str]] | None, entering: bool
):
if tag == "main":
self.in_main = entering
return
if not self.in_main:
return
# Campaign title
if tag == "h3":
self.in_campaign_title = entering
if entering:
# Start processing new campaign
self.campaigns.append(CampaignData())
assert attrs[0][0] == "id"
self.campaigns[-1].id = attrs[0][1]
return
# List of episodes
if tag == "ul":
self.in_episode_list = entering
if not entering:
# Finished processing campaign. Reorder episodes
self.campaigns[-1].episodes.reverse()
return
if not self.in_episode_list:
return
# Episode entry
if tag == "a":
self.in_episode_title = entering
if entering:
# Start processing new episode
self.campaigns[-1].episodes.append(EpisodeData())
assert attrs[0][0] == "href"
self.campaigns[-1].episodes[-1].html_path = attrs[0][1]
return
def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]):
self._handle_tag(tag, attrs, True)
def handle_endtag(self, tag: str):
self._handle_tag(tag, None, False)
def handle_data(self, data: str):
if self.in_campaign_title:
self.campaigns[-1].title += data
if self.in_episode_title:
self.campaigns[-1].episodes[-1].title += data
class CREpisodeParser(HTMLParser):
def __init__(self):
super().__init__()
self.in_main: bool = False
self.in_dt: bool = False
self.in_speaker: bool = False
self.in_line: bool = False
self.in_stamped_link: bool = False
self.dialogs: list[DialogData] = []
def _handle_tag(
self, tag: str, attrs: list[tuple[str, str]] | None, entering: bool
):
if tag == "main":
self.in_main = entering
return
if not self.in_main:
return
if tag == "dt":
self.in_dt = entering
return
# Speaker name
if self.in_dt and tag == "strong":
self.in_speaker = entering
if entering:
self.dialogs.append(DialogData())
return
# Dialog line
if tag == "dd":
self.in_line = entering
if entering:
self.dialogs[-1].lines.append("")
return
if not self.in_line:
return
# in line <a> are all stamped youtube link, which we ignore
if tag == "a":
self.in_stamped_link = entering
return
if self.in_stamped_link:
return
# Other tags found inside dialogs lines are assumed to be formatting and are preserved
assert tag in ["strong", "i"]
self.dialogs[-1].lines[-1] += f"<{tag}>" if entering else f"</{tag}>"
def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]):
self._handle_tag(tag, attrs, True)
def handle_endtag(self, tag: str):
self._handle_tag(tag, None, False)
def handle_data(self, data):
if self.in_speaker:
self.dialogs[-1].speaker += data
if self.in_line and not self.in_stamped_link:
self.dialogs[-1].lines[-1] += data
def main():
    """Convert every campaign listed in the index into one text file under ``txt/``.

    Parses ``INDEX`` for campaigns and their episode links, parses each
    episode page (episode paths are resolved relative to the index file's
    directory), writes ``txt/<campaign id>.txt`` for each campaign, and
    finally prints the pandoc commands that turn those files into epubs.
    """
    index_path = Path(INDEX)
    html_dir = index_path.parent

    print(f"Parsing {index_path}...")
    index_parser = CRIndexParser()
    # Read explicitly as UTF-8 instead of the platform default encoding, which
    # is not UTF-8 on every system (notably Windows).
    # NOTE(review): assumes the transcript pages are UTF-8 encoded - confirm.
    index_parser.feed(index_path.read_text(encoding="utf-8"))
    campaigns = index_parser.campaigns
    print()

    for campaign in campaigns:
        print(f"Compiling campaign: {campaign.title}...")

        # Parse individual episodes
        for episode in campaign.episodes:
            episode_path = html_dir / episode.html_path
            print(f"Parsing {episode_path}...")
            episode_parser = CREpisodeParser()
            episode_parser.feed(episode_path.read_text(encoding="utf-8"))
            episode.dialogs = episode_parser.dialogs

        book_txt = campaign.format()
        book_path = Path("txt") / f"{campaign.id}.txt"
        book_path.parent.mkdir(exist_ok=True)
        # Write as UTF-8 so the output does not depend on the platform default.
        book_path.write_text(book_txt, encoding="utf-8")
        print(f"Saved book text to: {book_path}")
        print()

    print(
        "To convert each preprocessed transcript to epub format individually using pandoc, run the following commands:"
    )
    print(" mkdir ebook/")
    for campaign in campaigns:
        book_path = Path("txt") / f"{campaign.id}.txt"
        epub_path = Path("ebook") / f"{campaign.id}.epub"
        print(f" pandoc -o {epub_path} {book_path}")
    print()
    print("Alternatively, you may also use this one-liner bash command:")
    print(
        ' mkdir ebook; for p in ./txt/*; do fname=$(basename -- "$p"); out="ebook/${fname%.*}.epub"; echo "Converting $p -> $out..."; pandoc -o $out $p; done; echo "Done!"'
    )
    print()
    print("Happy reading :-)")


# Run the conversion only when the file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment