Created
September 19, 2025 16:49
-
-
Save guilsa/873b32a78ac3397e96f969d8870f1dbf to your computer and use it in GitHub Desktop.
HTML downloader script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# A script to download the HTML content from a list of URLs.
#
# How to use:
#   chmod +x fetch-html.sh
#   ./fetch-html.sh urls.txt ./downloaded_pages
#
# This will read the links from `urls.txt` (one URL per line) and save the
# corresponding HTML files into the `downloaded_pages` directory.
#
# I use Link Gopher on Firefox to manually create my links.

# Fail fast on errors:
#   -e           exit when a command fails outside of a condition
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail
# --- Functions for usage and error handling ---

#######################################
# Print the command-line usage summary.
# Globals:   none
# Arguments: none
# Outputs:   the usage text on stdout (callers redirect it as needed)
# Returns:   0
#######################################
print_usage() {
  # ${0##*/} strips the directory part of the script path without
  # spawning an external `basename` process.
  printf '%s\n' \
    "Usage: ${0##*/} <url_file> <destination_directory>" \
    "" \
    "Arguments:" \
    " <url_file> A text file with one URL per line." \
    " <destination_directory> The directory where HTML files will be saved."
}
# --- Argument Validation ---
# Exactly two positional arguments are required: the URL list and the
# destination directory.
if (( $# != 2 )); then
  echo "Error: Incorrect number of arguments." >&2
  # This is an error path, so the usage text goes to stderr as well.
  print_usage >&2
  exit 1
fi

# Inputs are fixed for the rest of the script.
readonly URL_FILE="$1"
readonly DEST_DIR="$2"

# The URL list must be an existing regular file.
if [[ ! -f "$URL_FILE" ]]; then
  echo "Error: Input file '$URL_FILE' not found or is not a regular file." >&2
  exit 1
fi
# --- Main Logic ---
# Create the destination directory if it doesn't exist.
# The -p flag prevents errors if the directory already exists, and `--`
# keeps a directory name starting with "-" from being parsed as an option.
#
# mkdir runs inside the condition on purpose: if the script runs with
# `set -e`, a bare failing mkdir would abort before the error message
# below could ever be printed.
if ! mkdir -p -- "$DEST_DIR" || [[ ! -d "$DEST_DIR" ]]; then
  echo "Error: Could not create destination directory '$DEST_DIR'." >&2
  exit 1
fi
#######################################
# Derive a filesystem-safe base name from a URL.
#   1. Remove the protocol prefix (e.g. "https://"): everything up to and
#      including the first "//", provided no "/" comes before it.
#   2. Remove one trailing "/".
#   3. Replace every byte that is NOT a letter, digit, dot, underscore or
#      hyphen with an underscore.
# Arguments: $1 - the URL
# Outputs:   the sanitized name (without extension) on stdout
# Returns:   0
#######################################
url_to_filename() {
  # Byte-wise matching so the a-z / A-Z ranges mean plain ASCII.
  local LC_ALL=C
  local name=$1
  local -r scheme_re='^[^/]*//(.*)$'

  if [[ "$name" =~ $scheme_re ]]; then
    name=${BASH_REMATCH[1]}
  fi
  name=${name%/}

  # Pure parameter expansion: no trailing newline is ever fed through the
  # character replacement, so the name gets no spurious trailing "_".
  printf '%s\n' "${name//[^a-zA-Z0-9._-]/_}"
}

# Process the file, reading each URL line-by-line. The `|| [[ -n ... ]]`
# part keeps a final line that lacks a trailing newline.
while IFS= read -r url || [[ -n "$url" ]]; do
  # Trim leading/trailing whitespace, including the carriage return left
  # behind by files with Windows (CRLF) line endings.
  url=${url#"${url%%[![:space:]]*}"}
  url=${url%"${url##*[![:space:]]}"}

  # Skip empty or blank lines.
  [[ -n "$url" ]] || continue

  echo "Processing URL: $url"

  # Generate a safe filename from the URL to avoid path issues.
  filename=$(url_to_filename "$url")
  output_path="${DEST_DIR}/${filename}.html"

  # Download using curl.
  #   -L:     Follow redirects (like HTTP 301).
  #   -s:     Silent mode (no progress meter).
  #   --fail: Exit with an error if the HTTP request fails (e.g., 404 Not Found).
  #   -o:     Specify the output file.
  #   --:     End of options, so a line starting with "-" is treated as a
  #           URL and never as a curl option.
  # stdin is detached so curl can never consume the remaining URL list.
  if curl -Ls --fail -o "$output_path" -- "$url" </dev/null; then
    echo " -> Saved to $output_path"
  else
    # Grab the exit code from curl.
    error_code=$?
    echo " -> FAILED to download from $url (curl exited with code: $error_code)" >&2
    # Clean up the empty file that curl might have created on failure.
    rm -f -- "$output_path"
  fi
done <"$URL_FILE"

echo ""
echo "All done."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment