Last active
January 12, 2026 01:50
-
-
Save teklynk/2696ed0ec5ee0498985aeb0baa02abaa to your computer and use it in GitHub Desktop.
Website Scraper script using HTTrack command line
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# static_site.sh - mirror a website with HTTrack and publish it as static files.
#
# Requires HTTrack to scrape the site and generate html files and assets:
#   sudo apt install httrack
#
# Usage:
#   ./static_site.sh
#
# Steps:
#   1. Empty the scratch directory and mirror https://$domain into it.
#   2. Post-process every mirrored *.html file (strip the HTTrack marker,
#      swap the source domain for the production domain, drop blank lines).
#   3. Empty the static site directory and copy the mirrored files into it.
#
# Each step runs to completion before the next one starts, so no delays are
# needed between them. The script stops at the first failing step, and the
# static site directory is only touched after the mirror has been verified.
#
# Afterwards, simply ftp/copy the static files to your hosted server.
# This script can be useful for staging to production deployments.
# Staging environment could have DocumentRoot set to htdocs/ (with access to
# the admin panel for editing), while the Production environment only uses
# htdocs/static.
# This could also be used for production environments that do not have/allow
# PHP and MySQL, possibly for security concerns or cost.

set -euo pipefail
# An unmatched glob expands to nothing instead of the literal pattern.
shopt -s nullglob

# Define source and destination paths and domain name.
readonly httrackpath=/usr/bin/httrack
readonly domain=example.com
readonly proddomain=example-static.com
# Dedicated scratch directory: its contents are deleted on every run, so it
# must never be a shared location such as /tmp itself.
readonly temppath=/tmp/httrack-mirror
readonly staticsitepath=/static

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
#######################################
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

#######################################
# Escape a string so sed treats it literally as a basic regular expression
# inside an s#...#...# command.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout
#######################################
escape_sed_pattern() {
  printf '%s' "$1" | sed -e 's/[][\\.*^$#]/\\&/g'
}

#######################################
# Escape a string so sed inserts it literally as the replacement part of an
# s#...#...# command.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout
#######################################
escape_sed_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&#]/\\&/g'
}

#######################################
# Create a directory if needed and delete its non-hidden entries.
# Hidden entries (dotfiles) are left in place.
# Arguments: $1 - directory to empty; must be non-empty
#######################################
purge_dir() {
  local dir=${1:?purge_dir: directory path required}
  local -a entries

  mkdir -p -- "$dir"
  entries=("$dir"/*)
  if ((${#entries[@]} > 0)); then
    rm -rf -- "${entries[@]}"
  fi
}

#######################################
# Mirror the site, post-process the html files and publish the result.
# Globals: httrackpath, domain, proddomain, temppath, staticsitepath (read)
#######################################
main() {
  local mirror_root="${temppath}/${domain}"
  local domain_re prod_repl
  local -a mirrored

  # Check the tool first so nothing is deleted when the run cannot succeed.
  [[ -x "$httrackpath" ]] \
    || die "HTTrack not found at ${httrackpath} (sudo apt install httrack)"

  # Purge the temp/source directory.
  purge_dir "$temppath"

  # Run HTTrack.
  "$httrackpath" "https://${domain}" -O "$temppath" -v -a -w --footer ""

  # Refuse to continue unless the mirror actually produced files.
  [[ -d "$mirror_root" ]] || die "mirror directory ${mirror_root} was not created"
  mirrored=("$mirror_root"/*)
  ((${#mirrored[@]} > 0)) || die "mirror directory ${mirror_root} is empty"

  domain_re=$(escape_sed_pattern "$domain")
  prod_repl=$(escape_sed_replacement "$proddomain")

  # One pass over every html file, expressions applied in this order:
  #   1. remove the HTTrack comment/meta marker,
  #   2. replace the local domain with the production domain,
  #   3. delete lines that are empty or contain only white space.
  find "$temppath" -type f -name '*.html' -exec sed -i \
    -e 's#<!-- Added by HTTrack --><meta http-equiv="content-type" content="text/html;charset=UTF-8" /><!-- /Added by HTTrack -->##g' \
    -e "s#${domain_re}#${prod_repl}#g" \
    -e '/^[[:space:]]*$/d' \
    {} +

  # Purge the destination/static site directory.
  purge_dir "$staticsitepath"

  # Copy source/temp path to destination/static path.
  cp -r -- "${mirrored[@]}" "${staticsitepath}/"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment