thefranke/convert_bulletty_to_pdf.py

## convert_bulletty_to_pdf.py
#!/usr/bin/env python3
#
# Convert (flagged) articles in bulletty to PDF
#

import pypandoc
import toml
import frontmatter
import re

from urllib.parse import urljoin, urlparse
from pathlib import Path
from optparse import OptionParser
from appdirs import AppDirs

def replace_relative_urls(text, base_url):
    """Resolve the targets of inline Markdown links/images against *base_url*.

    Every ``[label](target)`` or ``[label](target "title")`` occurrence in
    *text* is rewritten so that ``target`` becomes
    ``urljoin(base_url, target)``. Targets that are already absolute are
    returned unchanged by ``urljoin``. A leading ``!`` (image syntax) lies
    outside the match and is therefore preserved, as is an optional title.

    Args:
        text: Markdown source to process.
        base_url: URL the relative targets are resolved against.

    Returns:
        The Markdown text with all inline link targets made absolute.
    """
    # The title is matched as "[^"]*" so that it can never extend past its own
    # closing quote; a greedy ".*" here would let one match swallow every
    # following titled link on the same line.
    pattern = r'\[([^\]]*)\]\((.*?)\s*("[^"]*")?\s*\)'

    def absolutize(match):
        label, target, title = match.groups()
        resolved = urljoin(base_url, target)
        if title:
            # keep the link title instead of silently dropping it
            return f'[{label}]({resolved} {title})'
        return f'[{label}]({resolved})'

    return re.sub(pattern, absolutize, text)

def read_md(file_path: Path):
    """Read a cached article and rewrite its front matter separators.

    The file is read line by line. Inside the first ``---`` ... ``---``
    section, the first `` =`` of each line is replaced by ``:`` so that
    ``key = value`` becomes ``key: value``. Only the first occurrence is
    touched, so values that themselves contain `` =`` stay intact. Lines
    after the closing ``---`` are never rewritten, even if another ``---``
    appears later in the document.

    Every line is stripped of leading and trailing whitespace and terminated
    with a single newline.
    NOTE(review): stripping also removes indentation from body lines, which
    may affect indented Markdown constructs; kept unchanged here.

    Args:
        file_path: Path of the Markdown file to read.

    Returns:
        The processed file content as a single string.
    """
    front_matter_done = False  # closing '---' has been consumed
    in_front_matter = False    # currently between the two '---' delimiters

    lines = []

    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()

            if not front_matter_done and line == "---":
                if in_front_matter:
                    front_matter_done = True
                in_front_matter = not in_front_matter

            if in_front_matter:
                # only the key/value separator, not ' =' inside the value
                line = line.replace(' =', ':', 1)

            lines.append(line + '\n')

    return "".join(lines)

def postprocess_md(text: str):
    """Prepare an article (front matter + Markdown) for PDF conversion.

    The front matter is parsed with ``frontmatter.loads`` and extended with:

    * ``urlcolor`` set to ``"Maroon"``,
    * ``subtitle`` set to a Markdown link pointing back to the article URL.

    Relative link and image targets in the body are then resolved against
    the article URL. If the front matter has no ``url`` entry, the subtitle
    and the link rewriting are skipped instead of raising ``KeyError``.

    Args:
        text: Article source including its front matter block.

    Returns:
        The updated article serialized again with ``frontmatter.dumps``.
    """
    fm = frontmatter.loads(text)

    # better formatting
    fm["urlcolor"] = "Maroon"

    url = fm.metadata.get("url")
    if url:
        # link back to original URL
        fm["subtitle"] = "[%s](%s)" % (url, url)

        # fix relative image URLs
        # The full article URL is the correct base: urljoin itself discards
        # the last path segment of the base when resolving a relative
        # reference. Stripping a segment beforehand would resolve links one
        # directory too high.
        fm.content = replace_relative_urls(fm.content, url)

    return frontmatter.dumps(fm)

def make_pdf(file_path: Path, out_path: Path, regenerate = True):
    """Convert one Markdown article to a PDF inside *out_path*.

    The PDF is named after the Markdown file with its suffix replaced by
    ``.pdf``. Conversion is skipped when the PDF already exists, unless
    *regenerate* is true. A ``RuntimeError`` raised by pypandoc is reported
    on stdout and the article is skipped.

    Args:
        file_path: Markdown file to convert.
        out_path: Directory the PDF is written to (``str`` is accepted too).
        regenerate: Convert even if the target PDF already exists.
    """
    # with_suffix only touches the real extension; str.replace('.md', ...)
    # would also rewrite any other '.md' inside the file name.
    pdf_path = Path(out_path) / Path(file_path.name).with_suffix('.pdf')

    if not regenerate and pdf_path.exists():
        return

    text = read_md(file_path)
    text = postprocess_md(text)

    print("Converting %s" % (file_path.name))
    try:
        pypandoc.convert_text(text, 'pdf', format='md', outputfile=pdf_path, extra_args=["--pdf-engine=xelatex"])
    except RuntimeError as e:
        print("Error, skipping")
        print(e)

def convert_cache(cache_path: Path, out_path: Path, flagged_only = True, regenerate = False):
    """Convert cached articles below *cache_path* to PDFs in *out_path*.

    With *flagged_only* set, the entries of the ``read_later`` list in
    ``<cache_path>/.later.toml`` are converted; each entry is taken as a path
    relative to ``<cache_path>/categories``. Otherwise every ``*.md`` file
    found recursively below *cache_path* is converted.

    Args:
        cache_path: Root of the bulletty cache directory.
        out_path: Directory the PDFs are written to.
        flagged_only: Restrict conversion to the ``read_later`` entries.
        regenerate: Passed on to ``make_pdf``; reconvert existing PDFs.
    """
    cache_path = Path(cache_path)

    if flagged_only:
        # Use the cache_path argument, not a module-level global, so the
        # function also works when imported or called with another directory.
        later = toml.load(cache_path / '.later.toml')
        # a file without a 'read_later' list simply has nothing flagged
        md_files = [cache_path / 'categories' / f for f in later.get('read_later', [])]
    else:
        md_files = cache_path.rglob('*.md')

    for md_file in md_files:
        make_pdf(md_file, out_path, regenerate)

# Command line entry point: parse the options, locate the bulletty cache and
# convert the selected articles into the requested output directory.
if __name__ == '__main__':
    parser = OptionParser()

    # Adding boolean options
    parser.add_option("-f", "--flaggedonly", action="store_true", default=True,  dest="flaggedonly", help="Only convert read-later flagged articles")
    # --flaggedonly defaults to True and can only set True, so a separate
    # switch is needed to reach the "convert everything" mode at all.
    parser.add_option("-a", "--all",         action="store_false",               dest="flaggedonly", help="Convert all cached articles, not only flagged ones")
    parser.add_option("-r", "--regenerate",  action="store_true", default=False, help="Force regeneration articles")
    parser.add_option("-o", "--outdir",                                          help="Specify path for PDF output")
    parser.add_option("-c", "--cachedir",                                        help="Specify bulletty cache directory (leave empty to detect automatically)")

    (options, args) = parser.parse_args()

    # fall back to the per-user data directory reported by appdirs
    cache_dir = options.cachedir if options.cachedir else AppDirs("bulletty").user_data_dir
    bulletty_cache = Path(cache_dir)

    if not options.outdir:
        print("Error: Please specify an output directory!\n")
        parser.print_help()
        # non-zero status so calling scripts can detect the usage error
        raise SystemExit(1)

    # hand over a Path, matching the annotation of convert_cache/make_pdf
    convert_cache(bulletty_cache, Path(options.outdir), options.flaggedonly, options.regenerate)
	#!/usr/bin/env python3
	#
	# Convert (flagged) articles in bulletty to PDF
	#

	import pypandoc
	import toml
	import frontmatter
	import re

	from urllib.parse import urljoin, urlparse
	from pathlib import Path
	from optparse import OptionParser
	from appdirs import AppDirs

	def replace_relative_urls(text, base_url):
	pattern = r'\[[^\]]\]\((.?)\s("(?:.[^"])")?\s*\)'
	return re.sub(pattern, lambda match: f'[{match.group(0).split("](")[0][1:]}]({urljoin(base_url, match.group(1))})', text)

	def read_md(file_path: Path):
	frontmattered = False
	frontmatter = False

	text = ""

	with open(file_path) as f:
	for l in f:
	line = l.strip()

	if not frontmattered and line == "---":
	if frontmatter:
	frontmattered = True
	frontmatter = not frontmatter

	if frontmatter:
	line = line.replace(' =', ':')

	text += line + '\n'

	return text

	def postprocess_md(text: str):
	fm = frontmatter.loads(text)

	# better formatting
	fm["urlcolor"] = "Maroon"

	# link back to original URL
	fm["subtitle"] = "[%s](%s)" % (fm["url"], fm["url"])

	# fix relative image URLs
	base_url = fm["url"]
	if base_url.endswith('.html'):
	base_url = '/'.join(base_url.split('/')[:-1])
	else:
	base_url = '/'.join(base_url.split('/')[:-1])

	fm.content = replace_relative_urls(fm.content, base_url)

	return frontmatter.dumps(fm)

	def make_pdf(file_path: Path, out_path: Path, regenerate = True):
	pdf_path = out_path / Path(file_path.name.replace('.md', '.pdf'))
	if regenerate or not pdf_path.exists():
	text = read_md(file_path)
	text = postprocess_md(text)

	print("Converting %s" % (file_path.name))
	try:
	pypandoc.convert_text(text, 'pdf', format='md', outputfile=pdf_path, extra_args=["--pdf-engine=xelatex"])
	except RuntimeError as e:
	print("Error, skipping")
	print(e)

	def convert_cache(cache_path: Path, out_path: Path, flagged_only = True, regenerate = False):
	if flagged_only:
	later = toml.load(bulletty_cache / '.later.toml')
	for f in later['read_later']:
	md_file = cache_path / 'categories' / f
	make_pdf(md_file, out_path, regenerate)
	else:
	for md_file in cache_path.rglob('*.md'):
	make_pdf(md_file, out_path, regenerate)

	if __name__ == '__main__':
	parser = OptionParser()

	# Adding boolean options
	parser.add_option("-f", "--flaggedonly", action="store_true", default=True, help="Only convert read-later flagged articles")
	parser.add_option("-r", "--regenerate", action="store_true", default=False, help="Force regeneration articles")
	parser.add_option("-o", "--outdir", help="Specify path for PDF output")
	parser.add_option("-c", "--cachedir", help="Specify bulletty cache directory (leave empty to detect automatically)")

	(options, args) = parser.parse_args()

	cache_dir = options.cachedir if options.cachedir else AppDirs("bulletty").user_data_dir
	bulletty_cache = Path(cache_dir)

	if not options.outdir:
	print("Error: Please specify an output directory!\n")
	parser.print_help()
	exit()

	convert_cache(bulletty_cache, options.outdir, options.flaggedonly, options.regenerate)
No results found