Created
September 13, 2020 11:39
-
-
Save ysard/d3bba9463ac2614ecceb5d85746e9bfb to your computer and use it in GitHub Desktop.
Dump and rebuild books from the Calameo website
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
# Template of a page URL; "%page_number%" is replaced by the page index.
URL="https://p.calameoassets.com/200424144000-11e93cd5206820f83e75e58a9b1e4652/p%page_number%.svgz"

# Download the pages of the book as svgz files.
# Arguments:
#   $1 - index of the last page to fetch (default: 196)
#   $2 - directory receiving the files (default: download/)
# Returns:
#   0 when every page was fetched, 1 when at least one download failed.
function get_images {
    local last_page="${1:-196}"
    local target_dir="${2:-download/}"
    local page page_url
    local status=0

    # download svg files
    for ((page = 1; page <= last_page; page++))
    do
        echo "page $page..."
        page_url="${URL/\%page_number\%/$page}"
        # Or printf -v page_url "$URL" "$page"
        # with as placeholder "%s"
        if ! wget --quiet --user-agent="Mozilla/5.0 (X11; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0" "$page_url" --directory-prefix="$target_dir"; then
            # Keep going with the other pages, but report the failure
            echo "page $page: download failed" >&2
            status=1
        fi
        # Pause between two requests (presumably to avoid hammering the server)
        sleep 0.5
    done

    return "$status"
}
# Convert every svgz page of the current directory into a PDF, then
# concatenate the pages into output.pdf.
# Returns:
#   1 when there is no page to merge, otherwise the status of pdfunite.
function merge_images {
    local svg
    local -a pages

    # convert svg files into pdfs
    for svg in *.svgz
    do
        # The pattern stays literal when nothing matches: skip it
        [ -e "$svg" ] || continue
        # NOTE(review): --export-pdf is the Inkscape 0.9x spelling; 1.x
        # releases expect --export-filename - confirm the installed version
        inkscape "$svg" --export-pdf "$svg.pdf"
    done

    # merge pdfs (sort their numbers by natural order)
    # Only the per-page files are listed, so an output.pdf left by a previous
    # run is never fed back into pdfunite as an input.
    mapfile -t pages < <(printf '%s\n' *.svgz.pdf | sort -V)
    if [ ! -e "${pages[0]}" ]; then
        echo "no page to merge" >&2
        return 1
    fi
    pdfunite "${pages[@]}" output.pdf
}
mkdir -p download
get_images

# Stop if the directory cannot be entered: every following step (including
# the final rm -rf) would otherwise run from the wrong directory
cd download || exit 1

# Extract fonts
# NOTE(review): the script is looked up in the new working directory
# (download/) - confirm that this is where extract_fonts.py lives
python3 extract_fonts.py

# Install fonts
mkdir -p ~/.fonts/custom/
cp -- *.woff ~/.fonts/custom/ && fc-cache

# Make book
merge_images

# Purge fonts
# NOTE(review): the whole directory is removed, including fonts that were
# already there before this run
rm -rf ~/.fonts/custom/ && fc-cache
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # coding: utf-8 | |
| # Standard imports | |
| from lxml import etree | |
| import re | |
| import base64 | |
| import glob | |
| from collections import defaultdict | |
| # Custom imports | |
| import cssutils | |
def write_font(content, title):
    """Decode a base64 payload and store it as ``<title>.woff``.

    :param content: Base64-encoded font data.
    :param title: Name of the output file without its extension; a relative
        name is resolved against the current working directory.
    """
    woff_path = title + ".woff"
    with open(woff_path, "wb+") as woff_file:
        raw_font = base64.b64decode(content)
        woff_file.write(raw_font)
def extract_fonts(svg_file, font_db):
    """Extract fonts stored in base64 in the stylesheet of the given svg file.

    Every ``@font-face`` rule whose ``src`` property embeds a base64 woff
    payload is dumped into ``<font-family>.woff`` through ``write_font``.

    :param svg_file: Path of the svg file to parse.
    :param font_db: Mapping meant to associate each payload with the set of
        font names using it. It is not filled at the moment (the duplication
        test relying on it is disabled); kept so that callers don't change.
    :raises ValueError: If the document doesn't hold exactly one ``style``
        element in the SVG namespace.
    """
    tree = etree.parse(svg_file)
    root = tree.getroot()
    ns = {'s': 'http://www.w3.org/2000/svg'}

    # Get only the stylesheet (1 per document)
    nodes = root.xpath('//s:style', namespaces=ns)
    if len(nodes) != 1:
        # Explicit check instead of an assert, which is stripped under -O
        raise ValueError(
            "%s: expected exactly 1 stylesheet, found %d" % (svg_file, len(nodes))
        )
    style_node = nodes[0]

    # Parse stylesheet
    sheet = cssutils.parseString(style_node.text)
    for rule in sheet:
        if not isinstance(rule, cssutils.css.CSSFontFaceRule):
            continue

        font_family = rule.style.fontFamily.replace("\"", "")

        # Get the src attribute and extract font in base64
        for css_property in rule.style:
            if css_property.name != "src":
                continue

            # url("data:application/font-woff;charset=utf-8;base64,XXX") format("woff")
            # Raw string: "\)" is not a valid escape in a normal string literal.
            # The comma following "base64" is matched outside of the group so
            # that only the payload itself is captured.
            m = re.search(
                r'base64,(.*)"\) format\("woff"\)', css_property.value
            )
            if m:
                # base64 code found => dump it
                # PS: 1 url per CSS rule in practice...
                write_font(m.group(1), font_family)
                # Keep the font and its name in memory (test)
                # font_db[m.group(1)].add(font_family)
def replace_private_chars(svgz_file):
    """Replace U+E01F (private-use character, not in fonts) by a
    non-breaking space (U+00A0) in the given svg file.

    The file is read as UTF-8 and rewritten in place.

    :param svgz_file: Path of the file to patch.
    """
    with open(svgz_file, "r+", encoding="utf8") as f_d:
        data = f_d.read().replace("\ue01f", "\u00a0")
        f_d.seek(0)
        f_d.write(data)
        # U+00A0 takes fewer bytes than U+E01F once encoded in UTF-8, so the
        # new content is shorter than the old one: drop the stale tail.
        f_d.truncate()
# Meant to map each base64 payload to the set of font names using it;
# extract_fonts currently leaves it empty.
font_database = defaultdict(set)

# Manual checks on single pages (disabled):
#extract_fonts("p126.svg", font_database)
#replace_private_chars("p194.svgz")

# Process every page found in the current directory: dump its fonts first,
# then patch its private-use characters.
svgz_paths = glob.glob("*.svgz")
for svgz_path in svgz_paths:
    print(svgz_path)
    extract_fonts(svgz_path, font_database)
    replace_private_chars(svgz_path)

# Duplication test (disabled): each entry of font_database was expected to
# hold exactly one font name
#print(font_database.values())
#print(len(font_database))
#for subset in font_database.values():
#    assert len(subset) == 1
#=> no font duplication...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment