Get files
$ docker run --rm -v `pwd`:/scratch --workdir /scratch datasci wget --wait 2 --random-wait --recursive --level=0 --page-requisites --convert-links https://treeid.arborday.org/
Caveat: if --recursive is used without --level, wget defaults to a depth of 5 levels; --level=0 (as above) means unlimited depth.
Result:
Total wall clock time: 33m 45s
Downloaded: 943 files, 14M in 1.7s (8.01 MB/s)
rename files:
$ cat rename.sh
#!/bin/bash
# rename.sh - rename every "...ItemID=<id>" file in the current directory
# to "<id>.html".

#######################################
# Renames each file whose name contains "ItemID=" to "<id>.html", where
# <id> is everything after the last "=" in the file name.
# Arguments: none (operates on the current directory)
# Outputs:   one "Renaming: ..." line per renamed file on stdout;
#            a "Skipping ..." diagnostic on stderr for each file left alone
# Returns:   0 if every candidate was renamed or deliberately skipped,
#            1 if any mv failed
#######################################
rename_item_files() {
  local file item_id new_name status=0

  # Loop through all files containing "ItemID="
  for file in *ItemID=*; do
    # An unmatched glob stays literal; skip it instead of failing in mv.
    [[ -e "$file" ]] || continue

    # Extract the ID: everything after the last "=" sign
    item_id="${file##*=}"
    if [[ -z "$item_id" ]]; then
      # A name ending in "=" would otherwise become the hidden file ".html".
      printf "Skipping '%s': nothing after the last '='\n" "$file" >&2
      continue
    fi

    # Construct the new filename
    new_name="${item_id}.html"
    if [[ -e "$new_name" ]]; then
      # Never overwrite an already-renamed page or a colliding ID.
      printf "Skipping '%s': '%s' already exists\n" "$file" "$new_name" >&2
      continue
    fi

    # Perform the rename; "--" protects names that start with "-".
    printf "Renaming: '%s' -> '%s'\n" "$file" "$new_name"
    mv -- "$file" "$new_name" || status=1
  done

  return "$status"
}

rename_item_files
Regex in Sublime:
replace
<a class="button" href="https://treeid\.arborday\.org/WhatTree\.cfm\?ItemID=([a-zA-Z0-9]+)">
with
<a class="button" href="\1.html">
Regex in Sublime:
<a href="WhatTree\.cfm\?ItemID=([a-zA-Z0-9]+)">
with
<a href="#\1">
Regex in Sublime:
<a class="button" href="WhatTree\.cfm\?ItemID=([a-zA-Z0-9]+)">
with
<a class="button" href="#\1">
Inject a named anchor at the top of each HTML page
# Prepend <h2 id="NAME">NAME</H2> to every .html file in the current
# directory, where NAME is the file name without its .html suffix.
for f in *.html; do
  # An unmatched glob stays literal; skip it rather than leave a stray temp file.
  [ -e "$f" ] || continue
  name="${f%.html}"
  # Per-file, per-process temp name: cannot clobber an unrelated file called
  # "temp", and does not end in .html so the glob never picks it up.
  tmp="${f}.tmp.$$"
  if { printf '<h2 id="%s">%s</H2>\n' "$name" "$name"; cat -- "$f"; } > "$tmp"; then
    mv -- "$tmp" "$f"
  else
    # Keep the original page untouched and clean up the partial output.
    rm -f -- "$tmp"
  fi
done
and
# Prepend an <HR> line to every .html file in the current directory.
for f in *.html; do
  # An unmatched glob stays literal; skip it rather than leave a stray temp file.
  [ -e "$f" ] || continue
  # Per-file, per-process temp name: cannot clobber an unrelated file called
  # "temp", and does not end in .html so the glob never picks it up.
  tmp="${f}.tmp.$$"
  if { printf '<HR>\n'; cat -- "$f"; } > "$tmp"; then
    mv -- "$tmp" "$f"
  else
    # Keep the original page untouched and clean up the partial output.
    rm -f -- "$tmp"
  fi
done
Merge all files into a single HTML:
# Append every .html file in the current directory, except index.html
# itself, to the end of index.html (in glob order).
for f in *.html; do
  # An unmatched glob stays literal; skip it so no empty index.html is created.
  [ -e "$f" ] || continue
  if [ "$f" != "index.html" ]; then
    cat -- "$f" >> index.html
  fi
done
Convert jpg to base64:
# Write FILE.jpg.base64 next to every .jpg in the current directory.
for file in *.jpg; do
  # An unmatched glob stays literal; skip it instead of failing in base64.
  [ -e "$file" ] || continue
  # Reading from stdin works with both GNU and BSD/macOS base64. GNU wraps
  # its output at 76 columns, so strip newlines to get a single-line payload
  # that can be pasted into a data: URI as-is.
  base64 < "$file" | tr -d '\n' > "${file}.base64"
done
Replace image pointers to file with base64 content:
<img src="_STRING_HERE">
Insert base64:
#!/usr/bin/env python3
import re
import os
# Selects which image set gets embedded: True targets the GIFs under
# trees/graphics/zones/, False targets the JPGs under graphics/.
zones=True # True for gif, False for jpg
# HTML file whose <img src="..."> references are rewritten.
input_html = 'index.html'
# Destination for the rewritten HTML.
output_html = 'index_embedded.html' # Saves to a new file to not overwrite the original
def replace_with_base64(match):
    """Return an inline data-URI ``src`` attribute for a matched image reference.

    Intended as the replacement callback for ``re.sub``.

    Args:
        match: Regex match whose group 1 is the image path,
            e.g. 'graphics/W29C.jpg'. The base64 payload is read from
            '<path>.base64'.

    Returns:
        'src="data:<mime>;base64,<payload>"' when the .base64 file exists,
        otherwise the original matched text unchanged.
    """
    # match.group(1) is the file path, e.g., 'graphics/W29C.jpg'
    img_path = match.group(1)
    base64_path = f"{img_path}.base64"

    # Leave the reference untouched when there is nothing to embed.
    if not os.path.exists(base64_path):
        print(f"Warning: Base64 file not found for {img_path}")
        return match.group(0)  # Return the original string if file is missing

    with open(base64_path, 'r', encoding='utf-8') as b64_file:
        # Drop ALL whitespace, not just leading/trailing: some base64 tools
        # wrap their output, which would otherwise leave newlines in the URI.
        b64_data = "".join(b64_file.read().split())

    # Pick the media type from the image's own extension; the registered
    # type for JPEG is image/jpeg (image/jpg is not a registered type).
    extension = os.path.splitext(img_path)[1].lower()
    mime_type = 'image/gif' if extension == '.gif' else 'image/jpeg'

    print(f"Replaced: {img_path}")
    # Return the new src attribute formatted for base64 inline images
    return f'src="data:{mime_type};base64,{b64_data}"'
if __name__ == "__main__":
    # Stop here when the input is missing; continuing would only fail in
    # open() below with a traceback after the error message.
    if not os.path.exists(input_html):
        raise SystemExit(f"Error: {input_html} not found.")

    with open(input_html, 'r', encoding='utf-8') as file_handle:
        html_content = file_handle.read()

    # Regex to find the src="..." references for the image set selected by
    # `zones`; group 1 captures the image path handed to the callback.
    if not zones:
        pattern = r'src="(graphics/[a-zA-Z0-9]+\.jpg)"'
    else:
        pattern = r'src="(trees/graphics/zones/[a-zA-Z0-9-]+\.gif)"'

    # re.sub calls the replace_with_base64 function for every match
    new_html = re.sub(pattern, replace_with_base64, html_content)

    # Write the modified HTML to the new file
    with open(output_html, 'w', encoding='utf-8') as f:
        f.write(new_html)

    print(f"\nDone! The new HTML file is saved as: {output_html}")