VRichardJP/cr2book.py

## cr2book.py
#!/usr/bin/env python3

# Simple script which converts all Critical Role transcripts from <https://www.kryogenix.org/crsearch/html/index.html> to text format. Text transcripts may then easily be converted to ebook formats using pandoc.
#
# How to use:
#
# 1. Download and extract all Critical Role transcripts from <https://www.kryogenix.org/crsearch/cr_full.zip>.
# 2. Run this script from the `cr_full/` directory (or change the value of `INDEX` below).
# 3. For each campaign, a new text file containing all episode transcripts will be saved under the `txt/` directory
# 4. You may then easily convert each text to e.g. an ebook using pandoc. For example: `pandoc txt/c1.txt -o c1.epub`
#
# Happy reading :-)

from html.parser import HTMLParser
from pathlib import Path
from dataclasses import dataclass, field

# Path of the transcript index page, relative to the current working directory.
# Episode links found in the index are resolved against this file's directory.
INDEX = "html/index.html"


@dataclass(kw_only=True)
class DialogData:
    speaker: str = ""
    lines: list[str] = field(default_factory=list)

    def format(self) -> str:
        return f"**{self.speaker}**\n{'\n'.join(self.lines)}\n"


@dataclass(kw_only=True)
class EpisodeData:
    title: str = ""
    html_path: str = ""
    dialogs: list[DialogData] = field(default_factory=list)

    def format(self) -> str:
        return f"# {self.title}\n\n{'\n'.join([dialog.format() for dialog in self.dialogs])}\n"


@dataclass(kw_only=True)
class CampaignData:
    id: str = ""
    title: str = ""
    episodes: list[EpisodeData] = field(default_factory=list)

    def format(self) -> str:
        return f"% {self.title}\n\n{'\n'.join([episode.format() for episode in self.episodes])}\n"


class CRIndexParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.in_main: bool = False
        self.in_campaign_title: bool = False
        self.in_episode_list: bool = False
        self.in_episode_title: bool = False
        self.in_episode_link: bool = False
        self.campaigns: list[CampaignData] = []

    def _handle_tag(
        self, tag: str, attrs: list[tuple[str, str]] | None, entering: bool
    ):
        if tag == "main":
            self.in_main = entering
            return

        if not self.in_main:
            return

        # Campaign title
        if tag == "h3":
            self.in_campaign_title = entering
            if entering:
                # Start processing new campaign
                self.campaigns.append(CampaignData())
                assert attrs[0][0] == "id"
                self.campaigns[-1].id = attrs[0][1]
            return

        # List of episodes
        if tag == "ul":
            self.in_episode_list = entering
            if not entering:
                # Finished processing campaign. Reorder episodes
                self.campaigns[-1].episodes.reverse()
            return

        if not self.in_episode_list:
            return

        # Episode entry
        if tag == "a":
            self.in_episode_title = entering
            if entering:
                # Start processing new episode
                self.campaigns[-1].episodes.append(EpisodeData())
                assert attrs[0][0] == "href"
                self.campaigns[-1].episodes[-1].html_path = attrs[0][1]
            return

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]):
        self._handle_tag(tag, attrs, True)

    def handle_endtag(self, tag: str):
        self._handle_tag(tag, None, False)

    def handle_data(self, data: str):
        if self.in_campaign_title:
            self.campaigns[-1].title += data
        if self.in_episode_title:
            self.campaigns[-1].episodes[-1].title += data


class CREpisodeParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.in_main: bool = False
        self.in_dt: bool = False
        self.in_speaker: bool = False
        self.in_line: bool = False
        self.in_stamped_link: bool = False
        self.dialogs: list[DialogData] = []

    def _handle_tag(
        self, tag: str, attrs: list[tuple[str, str]] | None, entering: bool
    ):
        if tag == "main":
            self.in_main = entering
            return

        if not self.in_main:
            return

        if tag == "dt":
            self.in_dt = entering
            return

        # Speaker name
        if self.in_dt and tag == "strong":
            self.in_speaker = entering
            if entering:
                self.dialogs.append(DialogData())
            return

        # Dialog line
        if tag == "dd":
            self.in_line = entering
            if entering:
                self.dialogs[-1].lines.append("")
            return

        if not self.in_line:
            return

        # in line <a> are all stamped youtube link, which we ignore
        if tag == "a":
            self.in_stamped_link = entering
            return

        if self.in_stamped_link:
            return

        # Other tags found inside dialogs lines are assumed to be formatting and are preserved
        assert tag in ["strong", "i"]
        self.dialogs[-1].lines[-1] += f"<{tag}>" if entering else f"</{tag}>"

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]):
        self._handle_tag(tag, attrs, True)

    def handle_endtag(self, tag: str):
        self._handle_tag(tag, None, False)

    def handle_data(self, data):
        if self.in_speaker:
            self.dialogs[-1].speaker += data
        if self.in_line and not self.in_stamped_link:
            self.dialogs[-1].lines[-1] += data


def main():
    """Convert every campaign listed in the index into one text file.

    Reads the index at ``INDEX``, parses each linked episode page (paths are
    resolved relative to the index's directory) and writes the formatted
    campaign to ``txt/<campaign id>.txt``. Progress messages and follow-up
    pandoc commands are printed to stdout.

    Raises:
        OSError: if the index or an episode page cannot be read, or an output
            file cannot be written.
        UnicodeDecodeError: if an input file is not valid UTF-8.
    """
    index_path = Path(INDEX)
    html_dir = index_path.parent
    txt_dir = Path("txt")

    print(f"Parsing {index_path}...")
    index_parser = CRIndexParser()
    # Decode explicitly instead of relying on the locale's default encoding,
    # which differs between platforms.
    # NOTE(review): assumes the transcript HTML is UTF-8 - confirm against
    # the downloaded archive.
    index_parser.feed(index_path.read_text(encoding="utf-8"))
    campaigns = index_parser.campaigns
    print()

    for campaign in campaigns:
        print(f"Compiling campaign: {campaign.title}...")

        # Parse individual episodes
        for episode in campaign.episodes:
            episode_path = html_dir / episode.html_path
            print(f"Parsing {episode_path}...")
            episode_parser = CREpisodeParser()
            episode_parser.feed(episode_path.read_text(encoding="utf-8"))
            episode.dialogs = episode_parser.dialogs

        book_txt = campaign.format()

        book_path = txt_dir / f"{campaign.id}.txt"
        book_path.parent.mkdir(exist_ok=True)

        # pandoc expects UTF-8 input, so write UTF-8 regardless of locale.
        book_path.write_text(book_txt, encoding="utf-8")

        print(f"Saved book text to: {book_path}")
        print()

    print(
        "To convert each preprocessed transcript to epub format individually using pandoc, run the following commands:"
    )
    print("    mkdir ebook/")
    for campaign in campaigns:
        book_path = txt_dir / f"{campaign.id}.txt"
        epub_path = Path("ebook") / f"{campaign.id}.epub"
        print(f"    pandoc -o {epub_path} {book_path}")
    print()

    print("Alternatively, you may also use this one-liner bash command:")
    print(
        '    mkdir ebook; for p in ./txt/*; do fname=$(basename -- "$p"); out="ebook/${fname%.*}.epub"; echo "Converting $p -> $out..."; pandoc -o $out $p; done; echo "Done!"'
    )
    print()

    print("Happy reading :-)")

# Run the conversion only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	# Simple script which converts all Critical Role transcripts from <https://www.kryogenix.org/crsearch/html/index.html> to text format. Text transcripts may then easily be converted to ebook formats using pandoc.
	#
	# How to use:
	#
	# 1. Download and extract all Critical Role transcripts from <https://www.kryogenix.org/crsearch/cr_full.zip>.
	# 2. Run this script from the `cr_full/` directory (or change value of `INDEX` below)
	# 3. For each campaign, a new text file containing all episode transcripts will be saved under the `txt/` directory
	# 4. You may then easily convert each text to e.g. an ebook using pandoc. For example: `pandoc txt/c1.txt -o c1.epub`
	#
	# Happy reading :-)

	from html.parser import HTMLParser
	from pathlib import Path
	from dataclasses import dataclass, field

	# NOTE(review): repeats the `INDEX` assignment made earlier in this file (same value).
	INDEX = "html/index.html"


	# NOTE(review): repeats the `DialogData` dataclass defined earlier in this
	# file, with every line flattened to a single tab, so the class and method
	# bodies are no longer nested and this span does not parse as Python.
	# Unlike the earlier definition, the speaker is not wrapped in `**` here.
	@dataclass(kw_only=True)
	class DialogData:
	speaker: str = ""
	lines: list[str] = field(default_factory=list)

	def format(self) -> str:
	return f"{self.speaker}\n{'\n'.join(self.lines)}\n"


	# NOTE(review): repeats the `EpisodeData` dataclass defined earlier in this
	# file, with every line flattened to a single tab; the bodies are no longer
	# nested, so this span does not parse as Python.
	@dataclass(kw_only=True)
	class EpisodeData:
	title: str = ""
	html_path: str = ""
	dialogs: list[DialogData] = field(default_factory=list)

	def format(self) -> str:
	return f"# {self.title}\n\n{'\n'.join([dialog.format() for dialog in self.dialogs])}\n"


	# NOTE(review): repeats the `CampaignData` dataclass defined earlier in this
	# file, with every line flattened to a single tab; the bodies are no longer
	# nested, so this span does not parse as Python.
	@dataclass(kw_only=True)
	class CampaignData:
	id: str = ""
	title: str = ""
	episodes: list[EpisodeData] = field(default_factory=list)

	def format(self) -> str:
	return f"% {self.title}\n\n{'\n'.join([episode.format() for episode in self.episodes])}\n"


	# NOTE(review): repeats the `CRIndexParser` class defined earlier in this
	# file, with every line flattened to a single tab; method and branch bodies
	# are no longer nested, so this span does not parse as Python. The `\|` in
	# the `_handle_tag` signature is also not valid Python (the earlier
	# definition has a plain `|` there).
	class CRIndexParser(HTMLParser):
	def __init__(self):
	super().__init__()
	self.in_main: bool = False
	self.in_campaign_title: bool = False
	self.in_episode_list: bool = False
	self.in_episode_title: bool = False
	self.in_episode_link: bool = False
	self.campaigns: list[CampaignData] = []

	def _handle_tag(
	self, tag: str, attrs: list[tuple[str, str]] \| None, entering: bool
	):
	if tag == "main":
	self.in_main = entering
	return

	if not self.in_main:
	return

	# Campaign title
	if tag == "h3":
	self.in_campaign_title = entering
	if entering:
	# Start processing new campaign
	self.campaigns.append(CampaignData())
	assert attrs[0][0] == "id"
	self.campaigns[-1].id = attrs[0][1]
	return

	# List of episodes
	if tag == "ul":
	self.in_episode_list = entering
	if not entering:
	# Finished processing campaign. Reorder episodes
	self.campaigns[-1].episodes.reverse()
	return

	if not self.in_episode_list:
	return

	# Episode entry
	if tag == "a":
	self.in_episode_title = entering
	if entering:
	# Start processing new episode
	self.campaigns[-1].episodes.append(EpisodeData())
	assert attrs[0][0] == "href"
	self.campaigns[-1].episodes[-1].html_path = attrs[0][1]
	return

	def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]):
	self._handle_tag(tag, attrs, True)

	def handle_endtag(self, tag: str):
	self._handle_tag(tag, None, False)

	def handle_data(self, data: str):
	if self.in_campaign_title:
	self.campaigns[-1].title += data
	if self.in_episode_title:
	self.campaigns[-1].episodes[-1].title += data


	# NOTE(review): repeats the `CREpisodeParser` class defined earlier in this
	# file, with every line flattened to a single tab; method and branch bodies
	# are no longer nested, so this span does not parse as Python. The `\|` in
	# the `_handle_tag` signature is also not valid Python (the earlier
	# definition has a plain `|` there).
	class CREpisodeParser(HTMLParser):
	def __init__(self):
	super().__init__()
	self.in_main: bool = False
	self.in_dt: bool = False
	self.in_speaker: bool = False
	self.in_line: bool = False
	self.in_stamped_link: bool = False
	self.dialogs: list[DialogData] = []

	def _handle_tag(
	self, tag: str, attrs: list[tuple[str, str]] \| None, entering: bool
	):
	if tag == "main":
	self.in_main = entering
	return

	if not self.in_main:
	return

	if tag == "dt":
	self.in_dt = entering
	return

	# Speaker name
	if self.in_dt and tag == "strong":
	self.in_speaker = entering
	if entering:
	self.dialogs.append(DialogData())
	return

	# Dialog line
	if tag == "dd":
	self.in_line = entering
	if entering:
	self.dialogs[-1].lines.append("")
	return

	if not self.in_line:
	return

	# in line <a> are all stamped youtube link, which we ignore
	if tag == "a":
	self.in_stamped_link = entering
	return

	if self.in_stamped_link:
	return

	# Other tags found inside dialogs lines are assumed to be formatting and are preserved
	assert tag in ["strong", "i"]
	self.dialogs[-1].lines[-1] += f"<{tag}>" if entering else f"</{tag}>"

	def handle_starttag(self, tag: str, attrs: list[tuple[str, str]]):
	self._handle_tag(tag, attrs, True)

	def handle_endtag(self, tag: str):
	self._handle_tag(tag, None, False)

	def handle_data(self, data):
	if self.in_speaker:
	self.dialogs[-1].speaker += data
	if self.in_line and not self.in_stamped_link:
	self.dialogs[-1].lines[-1] += data


	# NOTE(review): repeats the `main` function defined earlier in this file,
	# with every line flattened to a single tab; loop and `with` bodies are no
	# longer nested, so this span does not parse as Python. Its printed strings
	# also differ from the earlier definition: the command hints start with one
	# space instead of four, and the bash one-liner reads `./txt/` and
	# `${fname%.}` where the earlier one has `./txt/*` and `${fname%.*}`.
	def main():
	index_path = Path(INDEX)
	html_dir = index_path.parent

	print(f"Parsing {index_path}...")
	with index_path.open() as f:
	p = CRIndexParser()
	p.feed(f.read())
	campaigns = p.campaigns
	print()

	for campaign in campaigns:
	print(f"Compiling campaign: {campaign.title}...")

	# Parse individual episodes
	for episode in campaign.episodes:
	episode_path = html_dir / episode.html_path
	print(f"Parsing {episode_path}...")
	with episode_path.open() as f:
	p = CREpisodeParser()
	p.feed(f.read())
	episode.dialogs = p.dialogs

	book_txt = campaign.format()

	book_path = Path("txt") / f"{campaign.id}.txt"
	book_path.parent.mkdir(exist_ok=True)

	with book_path.open("w+") as f:
	f.write(book_txt)

	print(f"Saved book text to: {book_path}")
	print()

	print(
	"To convert each preprocessed transcript to epub format individually using pandoc, run the following commands:"
	)
	print(" mkdir ebook/")
	for campaign in campaigns:
	book_path = Path("txt") / f"{campaign.id}.txt"
	epub_path = Path("ebook") / f"{campaign.id}.epub"
	print(f" pandoc -o {epub_path} {book_path}")
	print()

	print("Alternatively, you may also use this one-liner bash command:")
	print(
	' mkdir ebook; for p in ./txt/; do fname=$(basename -- "$p"); out="ebook/${fname%.}.epub"; echo "Converting $p -> $out..."; pandoc -o $out $p; done; echo "Done!"'
	)
	print()

	print("Happy reading :-)")


	# NOTE(review): repeats the script entry guard that already appears earlier
	# in this file; the call below has lost its nesting under the `if`.
	if __name__ == "__main__":
	main()
No results found