Last active
October 27, 2025 00:39
-
-
Save VRichardJP/9efb97c42718d9d15016986f88e826ee to your computer and use it in GitHub Desktop.
Convert Critical Role transcripts from https://www.kryogenix.org/crsearch/html/index.html to epub using pandoc
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # Simple script which converts all Critical Role transcripts from <https://www.kryogenix.org/crsearch/html/index.html> to text format. Text transcripts may then easily be converted to ebook formats using pandoc. | |
| # | |
| # How to use: | |
| # | |
| # 1. Download and extract all Critical Role transcripts from <https://www.kryogenix.org/crsearch/cr_full.zip>. | |
| # 2. Run this script from the `cr_full/` directory (or change the value of `INDEX` below) | |
| # 3. For each campaign, a new text file containing all episode transcripts will be saved under the `txt/` directory | |
| # 4. You may then easily convert each text to e.g. an ebook using pandoc. For example: `pandoc txt/c1.txt -o c1.epub` | |
| # | |
| # Happy reading :-) | |
| from html.parser import HTMLParser | |
| from pathlib import Path | |
| from dataclasses import dataclass, field | |
# Location of the transcript index page, relative to the current working
# directory. Episode paths found in the index are resolved against the
# directory that contains this file.
INDEX = "html/index.html"
| @dataclass(kw_only=True) | |
| class DialogData: | |
| speaker: str = "" | |
| lines: list[str] = field(default_factory=list) | |
| def format(self) -> str: | |
| return f"**{self.speaker}**\n{'\n'.join(self.lines)}\n" | |
@dataclass(kw_only=True)
class EpisodeData:
    """One episode: its title, the path of its HTML transcript and its dialogs."""

    # Episode title, emitted as a level-1 heading by format().
    title: str = ""
    # Path of the episode's HTML file as given by the index page link.
    html_path: str = ""
    # Speaker turns of the episode, in transcript order.
    dialogs: list[DialogData] = field(default_factory=list)

    def format(self) -> str:
        """Render the episode as text.

        Returns:
            A ``# title`` heading, a blank line, then every formatted dialog
            separated by a newline, terminated by a newline.
        """
        # The join is done outside the f-string on purpose: a backslash inside
        # an f-string replacement field is a SyntaxError before Python 3.12.
        body = "\n".join(dialog.format() for dialog in self.dialogs)
        return f"# {self.title}\n\n{body}\n"
@dataclass(kw_only=True)
class CampaignData:
    """One campaign: its identifier, its title and its episodes."""

    # Short identifier of the campaign; used to name the output text file.
    id: str = ""
    # Campaign title, emitted on a leading ``% `` line by format().
    title: str = ""
    # Episodes of the campaign, in reading order.
    episodes: list[EpisodeData] = field(default_factory=list)

    def format(self) -> str:
        """Render the whole campaign as a single text document.

        Returns:
            A ``% title`` line (presumably meant as a pandoc title block),
            a blank line, then every formatted episode separated by a newline,
            terminated by a newline.
        """
        # The join is done outside the f-string on purpose: a backslash inside
        # an f-string replacement field is a SyntaxError before Python 3.12.
        body = "\n".join(episode.format() for episode in self.episodes)
        return f"% {self.title}\n\n{body}\n"
| class CRIndexParser(HTMLParser): | |
| def __init__(self): | |
| super().__init__() | |
| self.in_main: bool = False | |
| self.in_campaign_title: bool = False | |
| self.in_episode_list: bool = False | |
| self.in_episode_title: bool = False | |
| self.in_episode_link: bool = False | |
| self.campaigns: list[CampaignData] = [] | |
| def _handle_tag( | |
| self, tag: str, attrs: list[tuple[str, str]] | None, entering: bool | |
| ): | |
| if tag == "main": | |
| self.in_main = entering | |
| return | |
| if not self.in_main: | |
| return | |
| # Campaign title | |
| if tag == "h3": | |
| self.in_campaign_title = entering | |
| if entering: | |
| # Start processing new campaign | |
| self.campaigns.append(CampaignData()) | |
| assert attrs[0][0] == "id" | |
| self.campaigns[-1].id = attrs[0][1] | |
| return | |
| # List of episodes | |
| if tag == "ul": | |
| self.in_episode_list = entering | |
| if not entering: | |
| # Finished processing campaign. Reorder episodes | |
| self.campaigns[-1].episodes.reverse() | |
| return | |
| if not self.in_episode_list: | |
| return | |
| # Episode entry | |
| if tag == "a": | |
| self.in_episode_title = entering | |
| if entering: | |
| # Start processing new episode | |
| self.campaigns[-1].episodes.append(EpisodeData()) | |
| assert attrs[0][0] == "href" | |
| self.campaigns[-1].episodes[-1].html_path = attrs[0][1] | |
| return | |
| def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]): | |
| self._handle_tag(tag, attrs, True) | |
| def handle_endtag(self, tag: str): | |
| self._handle_tag(tag, None, False) | |
| def handle_data(self, data: str): | |
| if self.in_campaign_title: | |
| self.campaigns[-1].title += data | |
| if self.in_episode_title: | |
| self.campaigns[-1].episodes[-1].title += data | |
| class CREpisodeParser(HTMLParser): | |
| def __init__(self): | |
| super().__init__() | |
| self.in_main: bool = False | |
| self.in_dt: bool = False | |
| self.in_speaker: bool = False | |
| self.in_line: bool = False | |
| self.in_stamped_link: bool = False | |
| self.dialogs: list[DialogData] = [] | |
| def _handle_tag( | |
| self, tag: str, attrs: list[tuple[str, str]] | None, entering: bool | |
| ): | |
| if tag == "main": | |
| self.in_main = entering | |
| return | |
| if not self.in_main: | |
| return | |
| if tag == "dt": | |
| self.in_dt = entering | |
| return | |
| # Speaker name | |
| if self.in_dt and tag == "strong": | |
| self.in_speaker = entering | |
| if entering: | |
| self.dialogs.append(DialogData()) | |
| return | |
| # Dialog line | |
| if tag == "dd": | |
| self.in_line = entering | |
| if entering: | |
| self.dialogs[-1].lines.append("") | |
| return | |
| if not self.in_line: | |
| return | |
| # in line <a> are all stamped youtube link, which we ignore | |
| if tag == "a": | |
| self.in_stamped_link = entering | |
| return | |
| if self.in_stamped_link: | |
| return | |
| # Other tags found inside dialogs lines are assumed to be formatting and are preserved | |
| assert tag in ["strong", "i"] | |
| self.dialogs[-1].lines[-1] += f"<{tag}>" if entering else f"</{tag}>" | |
| def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]): | |
| self._handle_tag(tag, attrs, True) | |
| def handle_endtag(self, tag: str): | |
| self._handle_tag(tag, None, False) | |
| def handle_data(self, data): | |
| if self.in_speaker: | |
| self.dialogs[-1].speaker += data | |
| if self.in_line and not self.in_stamped_link: | |
| self.dialogs[-1].lines[-1] += data | |
def main():
    """Convert every campaign listed in the index page to one text file.

    Reads the index at ``INDEX``, parses each episode page it links to
    (resolved against the index's directory), writes one
    ``txt/<campaign id>.txt`` per campaign, then prints the pandoc commands
    that turn those text files into epub books.
    """
    index_path = Path(INDEX)
    html_dir = index_path.parent

    print(f"Parsing {index_path}...")
    index_parser = CRIndexParser()
    # The encoding is given explicitly so the result does not depend on the
    # platform's locale default. NOTE(review): assumes the transcript pages
    # are UTF-8 -- confirm against the downloaded archive.
    index_parser.feed(index_path.read_text(encoding="utf-8"))
    # close() flushes any text the parser is still buffering at end of input.
    index_parser.close()
    campaigns = index_parser.campaigns
    print()

    txt_dir = Path("txt")
    for campaign in campaigns:
        print(f"Compiling campaign: {campaign.title}...")

        # Parse individual episodes
        for episode in campaign.episodes:
            episode_path = html_dir / episode.html_path
            print(f"Parsing {episode_path}...")
            episode_parser = CREpisodeParser()
            episode_parser.feed(episode_path.read_text(encoding="utf-8"))
            episode_parser.close()
            episode.dialogs = episode_parser.dialogs

        book_path = txt_dir / f"{campaign.id}.txt"
        book_path.parent.mkdir(exist_ok=True)
        book_path.write_text(campaign.format(), encoding="utf-8")
        print(f"Saved book text to: {book_path}")
        print()

    print(
        "To convert each preprocessed transcript to epub format individually using pandoc, run the following commands:"
    )
    print(" mkdir ebook/")
    for campaign in campaigns:
        book_path = txt_dir / f"{campaign.id}.txt"
        epub_path = Path("ebook") / f"{campaign.id}.epub"
        print(f" pandoc -o {epub_path} {book_path}")
    print()
    print("Alternatively, you may also use this one-liner bash command:")
    print(
        ' mkdir ebook; for p in ./txt/*; do fname=$(basename -- "$p"); out="ebook/${fname%.*}.epub"; echo "Converting $p -> $out..."; pandoc -o $out $p; done; echo "Done!"'
    )
    print()
    print("Happy reading :-)")
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment