Skip to content

Instantly share code, notes, and snippets.

@guilsa
Created September 19, 2025 16:49
Show Gist options
  • Select an option

  • Save guilsa/873b32a78ac3397e96f969d8870f1dbf to your computer and use it in GitHub Desktop.

Select an option

Save guilsa/873b32a78ac3397e96f969d8870f1dbf to your computer and use it in GitHub Desktop.
HTML downloader script
#!/bin/bash
# Download the HTML content of every URL listed in a text file.
# Usage:
#   chmod +x fetch-html.sh
#   ./fetch-html.sh urls.txt ./downloaded_pages
# This reads the links from `urls.txt` (one URL per line) and saves the
# corresponding HTML files into the `downloaded_pages` directory.
# Tip: the Link Gopher extension for Firefox is a convenient way to build the list of links.
set -euo pipefail # Fail fast on errors
# --- Functions for usage and error handling ---
#######################################
# Print the command synopsis and a one-line description of each argument.
# Arguments: none ($0 is read to obtain the script's base name)
# Outputs:   five lines of help text on stdout
#######################################
print_usage() {
  # One printf call; the '%s\n' format is reused for every argument,
  # so each string below becomes its own output line.
  printf '%s\n' \
    "Usage: $(basename "$0") <url_file> <destination_directory>" \
    "" \
    "Arguments:" \
    " <url_file> A text file with one URL per line." \
    " <destination_directory> The directory where HTML files will be saved."
}
# --- Argument Validation ---
# Exactly two positional arguments are required: the URL list file and the
# destination directory. Anything else prints an error plus the usage text.
if (( $# != 2 )); then
  printf '%s\n' "Error: Incorrect number of arguments." >&2
  print_usage
  exit 1
fi

URL_FILE=$1
DEST_DIR=$2

# The URL list must exist and be a regular file before any work starts.
if ! [[ -f "$URL_FILE" ]]; then
  printf '%s\n' "Error: Input file '$URL_FILE' not found or is not a regular file." >&2
  exit 1
fi
# --- Main Logic ---
# Ensure the destination directory exists.
# -p makes mkdir succeed when the directory is already present, and "--" stops
# option parsing so a directory name starting with "-" is handled correctly.
# mkdir is evaluated inside the `if` condition on purpose: when errexit
# (`set -e`) is active, a failing bare `mkdir` would abort the script before
# the explanatory message below could be printed. The extra -d test confirms
# the path really is a directory afterwards.
if ! mkdir -p -- "$DEST_DIR" || [[ ! -d "$DEST_DIR" ]]; then
  echo "Error: Could not create destination directory '$DEST_DIR'." >&2
  exit 1
fi
#######################################
# Convert a URL into a filesystem-safe base name.
#   1. Drop the scheme prefix: everything up to and including the first "//",
#      provided no "/" precedes it (e.g. "https://").
#   2. Drop a single trailing slash.
#   3. Replace every byte that is not a letter, digit, dot, underscore or
#      hyphen with an underscore.
# Done with parameter expansion rather than `echo | sed | tr -c`: tr's
# complement set also matched echo's trailing newline, which appended a stray
# "_" to every generated name.
# Arguments: $1 - the URL
# Outputs:   the sanitized name on stdout ("index" if nothing is left)
#######################################
url_to_filename() {
  local LC_ALL=C # byte-wise matching, so a-z / A-Z mean ASCII letters only
  local name=$1
  local -r scheme_re='^[^/]*//(.*)$'
  if [[ "$name" =~ $scheme_re ]]; then
    name=${BASH_REMATCH[1]}
  fi
  name=${name%/}
  name=${name//[^a-zA-Z0-9._-]/_}
  printf '%s\n' "${name:-index}"
}

# Process the file, reading each URL line-by-line. The `|| [[ -n "$url" ]]`
# clause keeps a final line that has no trailing newline.
# Note: two URLs that sanitize to the same name overwrite each other's file.
while IFS= read -r url || [[ -n "$url" ]]; do
  # Tolerate CRLF (Windows) line endings in the URL file.
  url=${url%$'\r'}
  # Skip empty or whitespace-only lines (spaces and tabs alike).
  if [[ -z "${url//[[:space:]]/}" ]]; then
    continue
  fi
  echo "Processing URL: $url"
  filename=$(url_to_filename "$url")
  output_path="${DEST_DIR}/${filename}.html"
  # Download using curl.
  # -L: Follow redirects (like HTTP 301).
  # -s: Silent mode (no progress meter).
  # --fail: Exit with an error if the HTTP request fails (e.g., 404 Not Found).
  # -o: Specify the output file.
  # --: End of options, so a line starting with "-" is never parsed as a flag.
  if curl -Ls --fail -o "$output_path" -- "$url"; then
    echo " -> Saved to $output_path"
  else
    # First command of the else branch: $? still holds curl's exit status.
    error_code=$?
    echo " -> FAILED to download from $url (curl exited with code: $error_code)" >&2
    # Clean up the empty or partial file that curl might have created on failure.
    rm -f -- "$output_path"
  fi
done <"$URL_FILE"
echo ""
echo "All done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment