Skip to content

Instantly share code, notes, and snippets.

@ysard
Created September 13, 2020 11:39
Show Gist options
  • Select an option

  • Save ysard/d3bba9463ac2614ecceb5d85746e9bfb to your computer and use it in GitHub Desktop.

Select an option

Save ysard/d3bba9463ac2614ecceb5d85746e9bfb to your computer and use it in GitHub Desktop.
Dump and rebuild books from the Calameo website
#!/bin/bash
# URL template of one page of the book; get_images substitutes the page
# number for the literal "%page_number%" placeholder.
URL="https://p.calameoassets.com/200424144000-11e93cd5206820f83e75e58a9b1e4652/p%page_number%.svgz"
function get_images {
    # Download every page of the book (one .svgz file per page) into download/.
    #
    # Arguments:
    #   $1 - number of the last page to fetch (optional, defaults to 196)
    # Globals:
    #   URL - page URL template containing the "%page_number%" placeholder
    local last_page="${1:-196}"
    local page page_url

    for ((page = 1; page <= last_page; page++))
    do
        echo "page $page..."
        # Substitute the placeholder with the current page number.
        # Alternative: printf -v page_url "$URL" "$page" with "%s" as the
        # placeholder in URL.
        page_url="${URL/\%page_number\%/$page}"
        # The URL is quoted so that it is never word-split or glob-expanded.
        # wget runs with --quiet, so report failures explicitly instead of
        # silently ending up with a book that lacks pages.
        if ! wget --quiet --user-agent="Mozilla/5.0 (X11; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0" "$page_url" --directory-prefix="download/"
        then
            echo "page $page: download failed" >&2
        fi
        # Pause between two requests
        sleep 0.5
    done
}
function merge_images {
    # Convert every .svgz page of the current directory into a PDF, then merge
    # the per-page PDFs into output.pdf.
    #
    # Returns 1 when there is no page to merge.
    local svg pdf
    local -a pages=()

    # convert svg files into pdfs
    for svg in *.svgz
    do
        # An unmatched glob is left as the literal pattern: skip it
        [ -e "$svg" ] || continue
        inkscape "$svg" --export-pdf "$svg.pdf"
    done

    # merge pdfs (sort their numbers by natural order)
    # Only the per-page files (*.svgz.pdf) are listed, so an output.pdf left
    # by a previous run is never merged into itself. Names are read line by
    # line into an array instead of relying on word splitting.
    while IFS= read -r pdf
    do
        pages+=("$pdf")
    done < <(ls -v -- *.svgz.pdf 2>/dev/null)

    if [ "${#pages[@]}" -eq 0 ]
    then
        echo "merge_images: no page to merge" >&2
        return 1
    fi
    pdfunite "${pages[@]}" output.pdf
}
# Directory holding this script; used to locate extract_fonts.py once the
# working directory has been changed to download/.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

mkdir -p download
get_images
# Every following step works on the downloaded pages: abort rather than run
# them (including the font purge) from the wrong directory.
cd download || exit 1

# Extract fonts
# Prefer a copy of the helper placed next to the pages, otherwise use the one
# located beside this script.
if [ -f extract_fonts.py ]
then
    python3 extract_fonts.py
else
    python3 "$SCRIPT_DIR/extract_fonts.py"
fi

# Install fonts
mkdir -p ~/.fonts/custom/
cp ./*.woff ~/.fonts/custom/ && fc-cache

# Make book
merge_images

# Purge fonts
# NOTE(review): this removes the whole directory, including any font that was
# already there before the script ran - confirm this is acceptable.
rm -rf ~/.fonts/custom/ && fc-cache
#!/usr/bin/env python3
# coding: utf-8
# Standard imports
from lxml import etree
import re
import base64
import glob
from collections import defaultdict
# Custom imports
import cssutils
def write_font(content, title):
    """Decode a base64 font and write it to "<title>.woff" in the current directory.

    :param content: Base64 text of the font. The decoder runs in its default,
        non-validating mode, so characters outside the base64 alphabet (for
        instance a leading comma left over from a data URI) are ignored.
    :param title: Font family name, used as the file name without extension.
        Path separators are replaced by underscores.
    :type content: <str> or <bytes>
    :type title: <str>
    """
    # The name comes from the stylesheet of a downloaded file: neutralise the
    # path separators so the font is always written in the current directory.
    safe_title = title.replace("/", "_").replace("\\", "_")
    with open(safe_title + ".woff", "wb") as f_d:
        f_d.write(base64.b64decode(content))
def extract_fonts(svg_file, font_db):
    """Extract fonts stored in base64 in the stylesheet of the given svg file

    Every @font-face rule whose "src" property embeds a WOFF font as a base64
    data URI is dumped with write_font(), named after its font family.

    :param svg_file: Path of the svg file to parse.
    :param font_db: Mapping intended to associate a font with its names;
        currently not modified (only used by a disabled duplication test).
    :raises ValueError: If the document does not contain exactly one
        <style> element.
    """
    tree = etree.parse(svg_file)
    root = tree.getroot()
    ns = {'s': 'http://www.w3.org/2000/svg'}

    # Get only the stylesheet (1 per document)
    nodes = root.xpath('//s:style', namespaces=ns)
    if len(nodes) != 1:
        # Explicit check: an assert would be stripped when running with -O
        raise ValueError(
            "%s: expected exactly 1 <style> element, found %d"
            % (svg_file, len(nodes))
        )
    style_node = nodes[0]

    # Expected serialization of the src property:
    # url("data:application/font-woff;charset=utf-8;base64,XXX") format("woff")
    # Raw string: "\)" and "\(" are regex escapes, not string escapes.
    # The comma of the data URI is matched outside of the group so that only
    # the base64 payload is captured.
    woff_src = re.compile(r'base64,(.*)"\) format\("woff"\)')

    # Parse stylesheet
    sheet = cssutils.parseString(style_node.text)
    for rule in sheet:
        if not isinstance(rule, cssutils.css.CSSFontFaceRule):
            continue

        font_family = rule.style.fontFamily.replace('"', "")

        # Get the src attribute and extract font in base64
        for css_property in rule.style:
            if css_property.name != "src":
                continue

            match = woff_src.search(css_property.value)
            if match:
                # base64 code found => dump it
                # PS: in practice there is 1 url per CSS rule
                write_font(match.group(1), font_family)
                # Keep the font and its name in memory (test)
                # font_db[match.group(1)].add(font_family)
def replace_private_chars(svgz_file):
    """Replace the private use character U+E01F (not in fonts) by a
    non-breaking space (U+00A0) in the given svg file

    The file is rewritten in place.

    :param svgz_file: Path of the file to patch; it is read and written as
        UTF-8 text.
    :type svgz_file: <str>
    """
    # newline="": keep the line endings of the file untouched
    with open(svgz_file, "r+", encoding="utf8", newline="") as f_d:
        data = f_d.read().replace("\ue01f", "\u00a0")
        #data = f_d.read().replace(u"\u00a0", " ")
        f_d.seek(0)
        f_d.write(data)
        # U+00A0 takes 2 bytes in UTF-8 whereas U+E01F takes 3: the new content
        # is shorter, so drop the stale bytes left at the end of the file.
        f_d.truncate()
# Registry handed to extract_fonts(); only filled by the disabled
# duplication test at the end of the script.
font_database = defaultdict(set)

# Manual tests on single pages:
#extract_fonts("p126.svg", font_database)
#replace_private_chars("p194.svgz")

# Handle every page found in the current working directory: dump its embedded
# fonts first, then patch its private use characters in place.
page_files = glob.glob("*.svgz")
for svgz_file in page_files:
    print(svgz_file)
    extract_fonts(svgz_file, font_database)
    replace_private_chars(svgz_file)

# Disabled check showing that no font is stored under several names:
#print(font_database.values())
#print(len(font_database))
#for subset in font_database.values():
#    assert len(subset) == 1
#=> no font duplication...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment