guilsa/fetch-html.sh

## fetch-html.sh
#!/bin/bash

# A script to download the HTML content from a list of URLs.

# How to use:
#   chmod +x fetch-html.sh
#   ./fetch-html.sh urls.txt ./downloaded_pages
# This reads the links from `urls.txt` and saves the corresponding HTML files into the `downloaded_pages` directory.
# The URL list can be built by hand; I use the Link Gopher extension for Firefox to collect my links.

set -o errexit -o nounset -o pipefail # Fail fast: abort on command errors, unset variables, and failing pipeline stages

# --- Functions for usage and error handling ---
# Print the command-line synopsis and a description of both positional
# arguments to stdout. Takes no arguments; the program name shown in the
# synopsis is the basename of $0.
print_usage() {
	# One printf call emits all five lines; each argument becomes one line.
	printf '%s\n' \
		"Usage: $(basename "$0") <url_file> <destination_directory>" \
		"" \
		"Arguments:" \
		"  <url_file>              A text file with one URL per line." \
		"  <destination_directory> The directory where HTML files will be saved."
}

# --- Argument Validation ---
# Exactly two positional arguments are required: the URL list file and the
# destination directory. The error message and the usage text both go to
# stderr so that stdout carries only normal progress output.
if [[ "$#" -ne 2 ]]
then
	echo "Error: Incorrect number of arguments." >&2
	print_usage >&2
	exit 1
fi

URL_FILE="$1"
DEST_DIR="$2"

# The URL list must be an existing regular file.
if [[ ! -f "$URL_FILE" ]]
then
	echo "Error: Input file '$URL_FILE' not found or is not a regular file." >&2
	exit 1
fi

# --- Main Logic ---
# Create the destination directory (and any missing parents) if needed.
# -p makes an already-existing directory a no-op, and "--" stops a path that
# begins with "-" from being parsed as an option.
# mkdir is run inside the condition on purpose: with `set -e` active, a bare
# failing mkdir would abort the script before the message below is printed.
if ! mkdir -p -- "$DEST_DIR" || [[ ! -d "$DEST_DIR" ]]
then
	echo "Error: Could not create destination directory '$DEST_DIR'." >&2
	exit 1
fi

# Convert a URL into a filesystem-safe base name and print it to stdout.
# Arguments: $1 - the URL
#   1. Remove the protocol prefix (e.g., https://): everything up to and
#      including the first "//", provided no "/" comes before it.
#   2. Remove one trailing "/".
#   3. Replace any character that is NOT a letter, number, dot, underscore,
#      or hyphen with an underscore.
url_to_filename() {
	local name="$1"
	# Match byte-wise in the C locale so the a-z / A-Z ranges cover ASCII only.
	local LC_ALL=C
	if [[ "$name" =~ ^[^/]*//(.*)$ ]]
	then
		name="${BASH_REMATCH[1]}"
	fi
	name="${name%/}"
	# Pure parameter expansion: no trailing newline is ever fed through the
	# replacement, so the result does not end in a stray underscore.
	printf '%s\n' "${name//[^a-zA-Z0-9._-]/_}"
}

# Process the file, reading each URL line-by-line.
# The `|| [[ -n "$url" ]]` part keeps a final line that has no trailing newline.
while IFS= read -r url || [[ -n "$url" ]]
do
	# Tolerate CRLF line endings: drop a trailing carriage return.
	url="${url%$'\r'}"

	# Skip empty or blank lines (spaces, tabs, or any other whitespace only).
	if [[ -z "${url//[[:space:]]/}" ]]
	then
		continue
	fi

	echo "Processing URL: $url"

	# Generate a safe filename from the URL to avoid path issues.
	filename="$(url_to_filename "$url")"
	output_path="${DEST_DIR}/${filename}.html"

	# Download using curl.
	# -L: Follow redirects (like HTTP 301).
	# -s: Silent mode (no progress meter).
	# --fail: Exit with an error if the HTTP request fails (e.g., 404 Not Found).
	# -o: Specify the output file.
	# --: End of options, so a URL that starts with "-" is not parsed as a flag.
	if curl -Ls --fail -o "$output_path" -- "$url"
	then
		echo " -> Saved to $output_path"
	else
		# In the else branch $? still holds the exit status of curl.
		error_code=$?
		echo " -> FAILED to download from $url (curl exited with code: $error_code)" >&2
		# Clean up the empty file that curl might have created on failure.
		rm -f -- "$output_path"
	fi
done <"$URL_FILE"

echo ""
echo "All done."
	#!/bin/bash

	# A script to download the HTML content from a list of URLs.

	# How to use:
	#   chmod +x fetch-html.sh
	#   ./fetch-html.sh urls.txt ./downloaded_pages
	# This reads the links from `urls.txt` and saves the corresponding HTML files into the `downloaded_pages` directory.
	# The URL list can be built by hand; I use the Link Gopher extension for Firefox to collect my links.

	set -o errexit -o nounset -o pipefail # Fail fast: abort on command errors, unset variables, and failing pipeline stages

	# --- Functions for usage and error handling ---
	# Print the command-line synopsis and a description of both positional
	# arguments to stdout. Takes no arguments; the program name shown in the
	# synopsis is the basename of $0.
	# The argument descriptions are padded into two aligned columns.
	print_usage() {
		printf '%s\n' \
			"Usage: $(basename "$0") <url_file> <destination_directory>" \
			"" \
			"Arguments:" \
			"  <url_file>              A text file with one URL per line." \
			"  <destination_directory> The directory where HTML files will be saved."
	}

	# --- Argument Validation ---
	# Exactly two positional arguments are required: the URL list file and the
	# destination directory. The error message and the usage text both go to
	# stderr so that stdout carries only normal progress output.
	if [[ "$#" -ne 2 ]]
	then
		echo "Error: Incorrect number of arguments." >&2
		print_usage >&2
		exit 1
	fi

	URL_FILE="$1"
	DEST_DIR="$2"

	# The URL list must be an existing regular file.
	if [[ ! -f "$URL_FILE" ]]
	then
		echo "Error: Input file '$URL_FILE' not found or is not a regular file." >&2
		exit 1
	fi

	# --- Main Logic ---
	# Create the destination directory (and any missing parents) if needed.
	# -p makes an already-existing directory a no-op, and "--" stops a path that
	# begins with "-" from being parsed as an option.
	# mkdir is run inside the condition on purpose: with `set -e` active, a bare
	# failing mkdir would abort the script before the message below is printed.
	if ! mkdir -p -- "$DEST_DIR" || [[ ! -d "$DEST_DIR" ]]
	then
		echo "Error: Could not create destination directory '$DEST_DIR'." >&2
		exit 1
	fi

	# Convert a URL into a filesystem-safe base name and print it to stdout.
	# Arguments: $1 - the URL
	#   1. Remove the protocol prefix (e.g., https://): everything up to and
	#      including the first "//", provided no "/" comes before it.
	#   2. Remove one trailing "/".
	#   3. Replace any character that is NOT a letter, number, dot, underscore,
	#      or hyphen with an underscore.
	url_to_filename() {
		local name="$1"
		# Match byte-wise in the C locale so the a-z / A-Z ranges cover ASCII only.
		local LC_ALL=C
		if [[ "$name" =~ ^[^/]*//(.*)$ ]]
		then
			name="${BASH_REMATCH[1]}"
		fi
		name="${name%/}"
		# Pure parameter expansion: no trailing newline is ever fed through the
		# replacement, so the result does not end in a stray underscore.
		printf '%s\n' "${name//[^a-zA-Z0-9._-]/_}"
	}

	# Process the file, reading each URL line-by-line.
	# The `|| [[ -n "$url" ]]` part keeps a final line that has no trailing newline.
	# (The operator must be a real, unescaped ||; an escaped \|\| is handed to
	# `read` as a variable name and the loop never runs.)
	while IFS= read -r url || [[ -n "$url" ]]
	do
		# Tolerate CRLF line endings: drop a trailing carriage return.
		url="${url%$'\r'}"

		# Skip empty or blank lines (spaces, tabs, or any other whitespace only).
		if [[ -z "${url//[[:space:]]/}" ]]
		then
			continue
		fi

		echo "Processing URL: $url"

		# Generate a safe filename from the URL to avoid path issues.
		filename="$(url_to_filename "$url")"
		output_path="${DEST_DIR}/${filename}.html"

		# Download using curl.
		# -L: Follow redirects (like HTTP 301).
		# -s: Silent mode (no progress meter).
		# --fail: Exit with an error if the HTTP request fails (e.g., 404 Not Found).
		# -o: Specify the output file.
		# --: End of options, so a URL that starts with "-" is not parsed as a flag.
		if curl -Ls --fail -o "$output_path" -- "$url"
		then
			echo " -> Saved to $output_path"
		else
			# In the else branch $? still holds the exit status of curl.
			error_code=$?
			echo " -> FAILED to download from $url (curl exited with code: $error_code)" >&2
			# Clean up the empty file that curl might have created on failure.
			rm -f -- "$output_path"
		fi
	done <"$URL_FILE"

	echo ""
	echo "All done."
No results found