Generates a sitemap of a site by scraping all the URLs on each page it finds. Usage: .\generate-sitemap.ps1 https://yoursite.com
<#
.SYNOPSIS
    Generates a sitemap.xml file for a website by crawling its pages concurrently.
.DESCRIPTION
    This script starts crawling from a specified base URL, discovers
    internal links, and maintains a list of found (unprocessed) and
    processed URLs in text files. It filters URLs to ensure only
    those belonging to the same domain are processed and allows
    excluding URLs that contain specific path patterns.
    It processes pages concurrently using PowerShell jobs, up to
    MaxConcurrentPages at a time (default 5).
    Output files are saved in a subfolder named after the initial URL.
.PARAMETER BaseUrl
    The starting URL for the website crawl (e.g., "https://www.example.com").
    This URL must be absolute.
.PARAMETER OutputDirectory
    The base directory where the sitemap output folders will be created.
    A subfolder named after the sanitized BaseUrl will be created inside this.
    Defaults to ".\SitemapOutput".
.PARAMETER ExcludePatterns
    An array of strings. Any URL containing one of these strings in its path
    will be excluded from processing and the sitemap.
    Example: @("/-/media/", "/assets/", ".pdf")
.PARAMETER MaxConcurrentPages
    The maximum number of pages to process concurrently. Defaults to 5.
.PARAMETER DelayBetweenBatches
    Delay in milliseconds between processing batches to avoid overwhelming the server. Defaults to 500 ms.
.PARAMETER RequestTimeoutSeconds
    Timeout in seconds for HTTP requests. Defaults to 30 seconds.
.PARAMETER MaxRetries
    Maximum number of retry attempts for failed requests. Defaults to 2.
.PARAMETER UserAgent
    User-Agent string to identify the crawler. Defaults to "SitemapGenerator/1.0".
.PARAMETER AutoContinue
    Automatically continue from a previous run without prompting.
.PARAMETER ForceRestart
    Force a fresh start, clearing previous data.
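.EXAMPLE
    .\generate-sitemap.ps1 https://yoursite.com
    Crawls the site with default settings (the invocation shown in the gist
    description; https://yoursite.com is a placeholder).
.EXAMPLE
    .\generate-sitemap.ps1 -BaseUrl https://yoursite.com -MaxConcurrentPages 10 -DelayBetweenBatches 1000 -AutoContinue
    Crawls with ten concurrent jobs, a one-second pause between batches, and
    resumes any previous run for this URL without prompting.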
.NOTES
    Author: Your Name/AI Assistant
    Version: 1.3
    Date: July 4, 2025
    Requires PowerShell 5.1 or later.
    Be mindful of the website's robots.txt and server load when running this script.
    Consider adding delays between requests for large sites.
#>
param(
    [Parameter(Mandatory=$true)]
    [string]$BaseUrl,
    [string]$OutputDirectory = ".\SitemapOutput",
    [string[]]$ExcludePatterns = @("page-not-found", "/-/media/", ".pdf", ".zip", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".css", ".js", "#"),
    [int]$MaxConcurrentPages = 5,
    [int]$DelayBetweenBatches = 500,
    [int]$RequestTimeoutSeconds = 30,
    [int]$MaxRetries = 2,
    [string]$UserAgent = "SitemapGenerator/1.0 (PowerShell; +https://github.com/sitemap-generator)",
    [switch]$AutoContinue,
    [switch]$ForceRestart
)
# --- Configuration and Initialization ---
# Ensure the base URL ends with a slash, for consistent parsing
# (applies to any URL lacking one, not just bare domains)
if ($BaseUrl -notmatch '/$') {
    $BaseUrl = "$BaseUrl/"
}
# Create a URI object for the base URL to easily get host and scheme
try {
    $BaseUri = New-Object System.Uri($BaseUrl)
    $BaseHost = $BaseUri.Host.ToLowerInvariant()
    $BaseScheme = $BaseUri.Scheme.ToLowerInvariant()
} catch {
    Write-Error "Invalid BaseUrl provided: '$BaseUrl'. Please provide a valid absolute URL."
    exit 1
}
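# Optional pre-flight courtesy check -- a sketch only, not enforced by this
# script: the notes above recommend respecting robots.txt, so you could warn
# yourself that one exists before crawling. Uncomment to use.
# $robotsUrl = "${BaseScheme}://${BaseHost}/robots.txt"
# $robots = Invoke-WebRequest -Uri $robotsUrl -UseBasicParsing -ErrorAction SilentlyContinue
# if ($robots -and $robots.StatusCode -eq 200) {
#     Write-Host "robots.txt found; review it before crawling: $robotsUrl" -ForegroundColor Yellow
# }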
# Sanitize the BaseUrl to create a valid folder name
# Replace scheme and domain parts, then replace invalid path chars with underscores
$SanitizedUrlFolderName = $BaseUrl.Replace("://", "__")
$SanitizedUrlFolderName = $SanitizedUrlFolderName.Replace("/", "_")
$SanitizedUrlFolderName = $SanitizedUrlFolderName.TrimEnd('_') # Remove trailing underscore if any
# Remove any characters that are invalid for file paths in Windows
$invalidChars = [System.IO.Path]::GetInvalidFileNameChars() + [System.IO.Path]::GetInvalidPathChars()
foreach ($char in $invalidChars) {
    $SanitizedUrlFolderName = $SanitizedUrlFolderName.Replace($char, '_')
}
# Ensure it's not too long for a path
if ($SanitizedUrlFolderName.Length -gt 100) { # Limit length to avoid path issues
    $SanitizedUrlFolderName = $SanitizedUrlFolderName.Substring(0, 100) + "_truncated"
}
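# For example (using the placeholder URL from the usage above), a BaseUrl of
# "https://yoursite.com/" sanitizes to "https__yoursite.com", so output lands
# in ".\SitemapOutput\https__yoursite.com\".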
$CurrentOutputDirectory = Join-Path $OutputDirectory $SanitizedUrlFolderName
$FoundUrlsFile = Join-Path $CurrentOutputDirectory "found-urls.txt"
$ProcessedUrlsFile = Join-Path $CurrentOutputDirectory "processed-urls.txt"
$SitemapFile = Join-Path $CurrentOutputDirectory "sitemap.xml"
$Error404File = Join-Path $CurrentOutputDirectory "404-errors.txt"
# Create output directory if it doesn't exist
if (-not (Test-Path $CurrentOutputDirectory -PathType Container)) {
    New-Item -Path $CurrentOutputDirectory -ItemType Directory -Force | Out-Null
    Write-Host "Created output directory: $CurrentOutputDirectory"
}
# Check if previous run files exist and handle continuation
$foundUrlsExist = Test-Path $FoundUrlsFile
$processedUrlsExist = Test-Path $ProcessedUrlsFile
$error404Exist = Test-Path $Error404File
$previousRunExists = $foundUrlsExist -or $processedUrlsExist -or $error404Exist
# Use a hashtable for quick lookups of processed URLs
$ProcessedUrls = @{}
# Use a hashtable to track all URLs that have been found (either processed or in queue)
$AllFoundUrls = @{}
# Use a queue for URLs to process (FIFO)
$UrlsToProcess = [System.Collections.Generic.Queue[string]]::new()
# Track 404 errors with source information
$Error404Urls = @{}
# Track which page each URL was found on
$UrlSources = @{}
$continueFromPrevious = $false
if ($previousRunExists -and -not $ForceRestart) {
    if ($AutoContinue) {
        $continueFromPrevious = $true
        Write-Host "Auto-continuing from previous run..." -ForegroundColor Cyan
    } else {
        Write-Host "`nPrevious run detected!" -ForegroundColor Yellow
        Write-Host "Found files:" -ForegroundColor Gray
        if ($foundUrlsExist) {
            $foundCount = (Get-Content $FoundUrlsFile | Measure-Object).Count
            Write-Host " - found-urls.txt: $foundCount URLs" -ForegroundColor Gray
        }
        if ($processedUrlsExist) {
            $processedCount = (Get-Content $ProcessedUrlsFile | Measure-Object).Count
            Write-Host " - processed-urls.txt: $processedCount URLs" -ForegroundColor Gray
        }
        if ($error404Exist) {
            $error404Count = (Get-Content $Error404File | Measure-Object).Count
            Write-Host " - 404-errors.txt: $error404Count URLs" -ForegroundColor Gray
        }
        Write-Host "`nWould you like to:" -ForegroundColor Cyan
        Write-Host " [C] Continue from where the previous run left off" -ForegroundColor Green
        Write-Host " [R] Restart fresh (clear previous data)" -ForegroundColor Yellow
        Write-Host " [Q] Quit" -ForegroundColor Red
        do {
            $choice = Read-Host "Enter your choice (C/R/Q)"
            $choice = $choice.ToUpper()
        } while ($choice -notin @('C', 'R', 'Q'))
        switch ($choice) {
            'C' {
                $continueFromPrevious = $true
                Write-Host "Continuing from previous run..." -ForegroundColor Green
            }
            'R' {
                $continueFromPrevious = $false
                Write-Host "Starting fresh..." -ForegroundColor Yellow
            }
            'Q' {
                Write-Host "Exiting..." -ForegroundColor Red
                exit 0
            }
        }
    }
} elseif ($ForceRestart) {
    $continueFromPrevious = $false
    Write-Host "Force restart specified. Starting fresh..." -ForegroundColor Yellow
}
if ($continueFromPrevious) {
    # Load existing URLs from files
    Write-Host "Loading previous run data..." -ForegroundColor Cyan
    if ($processedUrlsExist) {
        $existingProcessed = Get-Content $ProcessedUrlsFile
        foreach ($url in $existingProcessed) {
            if (-not [string]::IsNullOrWhiteSpace($url)) {
                $ProcessedUrls[$url] = $true
            }
        }
        Write-Host " Loaded $($ProcessedUrls.Count) processed URLs" -ForegroundColor Gray
    }
    if ($error404Exist) {
        $existing404 = Get-Content $Error404File
        foreach ($line in $existing404) {
            if (-not [string]::IsNullOrWhiteSpace($line)) {
                # Parse the format: "URL | Found on: SourceURL"
                if ($line -match '^(.+?)\s*\|\s*Found on:\s*(.+)$') {
                    $url404 = $matches[1].Trim()
                    $sourceUrl = $matches[2].Trim()
                    $Error404Urls[$url404] = $sourceUrl
                } else {
                    # Handle old format (just URL)
                    $Error404Urls[$line] = "Unknown source"
                }
            }
        }
        Write-Host " Loaded $($Error404Urls.Count) 404 error URLs" -ForegroundColor Gray
    }
    if ($foundUrlsExist) {
        $existingFound = Get-Content $FoundUrlsFile
        $unprocessedUrls = @()
        foreach ($url in $existingFound) {
            if (-not [string]::IsNullOrWhiteSpace($url)) {
                $AllFoundUrls[$url] = $true
                # Add to queue only if not already processed
                if (-not $ProcessedUrls.ContainsKey($url)) {
                    $unprocessedUrls += $url
                }
            }
        }
        # Enqueue unprocessed URLs
        foreach ($url in $unprocessedUrls) {
            $UrlsToProcess.Enqueue($url)
        }
        Write-Host " Loaded $($AllFoundUrls.Count) found URLs" -ForegroundColor Gray
        Write-Host " Queued $($unprocessedUrls.Count) unprocessed URLs" -ForegroundColor Green
    }
    # If the queue is empty and the base URL hasn't been recorded yet, seed the crawl with it
    if ($UrlsToProcess.Count -eq 0 -and -not $AllFoundUrls.ContainsKey($BaseUrl)) {
        $UrlsToProcess.Enqueue($BaseUrl)
        if (-not [string]::IsNullOrWhiteSpace($BaseUrl)) {
            Add-Content -Path $FoundUrlsFile -Value $BaseUrl
        }
        $AllFoundUrls[$BaseUrl] = $true
        $UrlSources[$BaseUrl] = "Initial URL"
        Write-Host " Added base URL to queue: $BaseUrl" -ForegroundColor Green
    }
    Write-Host "Resuming crawl..." -ForegroundColor Green
} else {
    # Clear previous run data or initialize files
    # Create empty files without any content (no blank lines)
    New-Item -Path $FoundUrlsFile -ItemType File -Force | Out-Null
    New-Item -Path $ProcessedUrlsFile -ItemType File -Force | Out-Null
    New-Item -Path $Error404File -ItemType File -Force | Out-Null
    Write-Host "Initialized fresh found-urls.txt, processed-urls.txt, and 404-errors.txt." -ForegroundColor Green
    # Add the initial URL to the queue, found file, and AllFoundUrls tracker
    $UrlsToProcess.Enqueue($BaseUrl)
    if (-not [string]::IsNullOrWhiteSpace($BaseUrl)) {
        Add-Content -Path $FoundUrlsFile -Value $BaseUrl
    }
    $AllFoundUrls[$BaseUrl] = $true # Mark as found
    $UrlSources[$BaseUrl] = "Initial URL" # Mark as the starting point
    Write-Host "Starting fresh crawl from: $BaseUrl" -ForegroundColor Green
}
# --- Helper Functions (defined in parent scope for initial checks) ---
function Test-IsInternalUrl {
    <#
    .SYNOPSIS
        Checks if a given URL is internal to the base domain.
    .PARAMETER Url
        The URL string to check.
    #>
    param(
        [string]$Url
    )
    try {
        $uri = New-Object System.Uri($Url)
        # Check if scheme and host match the base URL's scheme and host
        return ($uri.Scheme.ToLowerInvariant() -eq $BaseScheme -and $uri.Host.ToLowerInvariant() -eq $BaseHost)
    } catch {
        # If it's not a valid URI, it's likely not an internal, absolute URL
        return $false
    }
}
function Test-IsExcludedUrl {
    <#
    .SYNOPSIS
        Checks if a given URL should be excluded based on defined patterns.
    .PARAMETER Url
        The URL string to check.
    #>
    param(
        [string]$Url
    )
    foreach ($pattern in $ExcludePatterns) {
        if ($Url -like "*$pattern*") {
            return $true
        }
    }
    return $false
}
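# For example, with the default patterns above, Test-IsExcludedUrl returns
# $true for "https://yoursite.com/-/media/logo.png" (matches "/-/media/") and
# for "https://yoursite.com/report.pdf" (matches ".pdf"), but $false for
# "https://yoursite.com/about". Test-IsInternalUrl rejects any URL whose
# scheme or host differs from BaseUrl, e.g. "http://yoursite.com/about" when
# crawling over https. (yoursite.com is the placeholder from the usage above.)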
# --- Main Crawling Logic ---
# Initialize progress tracking variables
$totalProcessed = $ProcessedUrls.Count
$totalFound = $AllFoundUrls.Count
$batchNumber = 0
if ($totalProcessed -gt 0) {
    Write-Host "`nResuming with $totalProcessed already processed out of $totalFound found URLs" -ForegroundColor Cyan
}
while ($UrlsToProcess.Count -gt 0) {
    $currentBatchUrls = New-Object System.Collections.Generic.List[string]
    $jobs = @()
    # Dequeue up to MaxConcurrentPages URLs for this batch
    for ($i = 0; $i -lt $MaxConcurrentPages -and $UrlsToProcess.Count -gt 0; $i++) {
        $url = $UrlsToProcess.Dequeue()
        # Only add to batch if not already processed, excluded, or blank
        if (-not $ProcessedUrls.ContainsKey($url) -and -not (Test-IsExcludedUrl -Url $url) -and -not [string]::IsNullOrWhiteSpace($url)) {
            [void]$currentBatchUrls.Add($url)
        }
    }
    if ($currentBatchUrls.Count -eq 0) {
        # If the batch is empty (e.g., all dequeued URLs were already processed/excluded), continue to next iteration
        continue
    }
    $batchNumber++
    $batchStartIndex = $totalProcessed + 1
    $batchEndIndex = $totalProcessed + $currentBatchUrls.Count
    Write-Host "`nBatch ${batchNumber}: Processing URLs $batchStartIndex-$batchEndIndex of $totalFound found" -ForegroundColor Yellow
    Write-Host " Queue remaining: $($UrlsToProcess.Count) | Already processed: $totalProcessed" -ForegroundColor Gray
    foreach ($urlToProcess in $currentBatchUrls) {
        # Add to processed list and file immediately before starting job
        # This marks it as 'being processed' and prevents it from being picked up again.
        if (-not $ProcessedUrls.ContainsKey($urlToProcess) -and -not [string]::IsNullOrWhiteSpace($urlToProcess)) {
            $ProcessedUrls[$urlToProcess] = $true
            Add-Content -Path $ProcessedUrlsFile -Value $urlToProcess
            $currentUrlIndex = $batchStartIndex + $currentBatchUrls.IndexOf($urlToProcess)
            Write-Host " [$currentUrlIndex/$totalFound] Starting: $urlToProcess" -ForegroundColor Cyan
        } else {
            # This should ideally not happen if the initial dequeue check is perfect, but good for robustness
            Write-Warning " Skipping job for '$urlToProcess' as it was already marked as processed."
            continue
        }
        # Start a job for each URL in the batch
        # Pass necessary parameters for internal URL checking and exclusion to the job's scope
        $job = Start-Job -ScriptBlock {
            param($url, $baseScheme, $baseHost, $excludePatterns, $baseUriAbsoluteUri, $userAgent, $timeoutSeconds, $maxRetries)
            # Re-define helper functions within the job's scope as jobs run in isolation
            function Test-IsInternalUrlInJob {
                param([string]$Url)
                try {
                    $uri = New-Object System.Uri($Url)
                    return ($uri.Scheme.ToLowerInvariant() -eq $baseScheme -and $uri.Host.ToLowerInvariant() -eq $baseHost)
                } catch {
                    return $false
                }
            }
            function Test-IsExcludedUrlInJob {
                param([string]$Url)
                foreach ($pattern in $excludePatterns) {
                    if ($Url -like "*$pattern*") {
                        return $true
                    }
                }
                return $false
            }
            function Resolve-RelativeUrlInJob {
                param([string]$CurrentPageUrl, [string]$RelativeUrl)
                try {
                    $base = New-Object System.Uri($CurrentPageUrl)
                    $resolved = New-Object System.Uri($base, $RelativeUrl)
                    return $resolved.AbsoluteUri
                } catch {
                    return $null
                }
            }
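            # For example, Resolve-RelativeUrlInJob resolves "../contact" against
            # "https://yoursite.com/about/team" to "https://yoursite.com/contact"
            # (System.Uri does the path arithmetic).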
            $foundLinks = @()
            $is404Error = $false
            $statusCode = 0
            $retryCount = 0
            $success = $false
            $response = $null
            # Normalize the URL for consistent processing (e.g., remove trailing slash if not root)
            if ($url -ne $baseUriAbsoluteUri -and $url.EndsWith('/')) {
                $url = $url.TrimEnd('/')
            }
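            # e.g. "https://yoursite.com/about/" becomes "https://yoursite.com/about",
            # while the root URL keeps its trailing slash so it still equals $baseUriAbsoluteUri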
            # Retry logic with exponential backoff
            while (-not $success -and $retryCount -le $maxRetries) {
                try {
                    # Create headers with User-Agent
                    $headers = @{
                        'User-Agent' = $userAgent
                    }
                    $response = Invoke-WebRequest -Uri $url -UseBasicParsing -Headers $headers -TimeoutSec $timeoutSeconds -ErrorAction Stop
                    $statusCode = $response.StatusCode
                    $success = $true
                    if ($response.StatusCode -eq 200) {
                        $links = $response.Links | Select-Object -ExpandProperty href -ErrorAction SilentlyContinue
                        foreach ($link in $links) {
                            if ([string]::IsNullOrEmpty($link)) {
                                continue
                            }
                            $absoluteLink = $null
                            if ($link.StartsWith("http://") -or $link.StartsWith("https://")) {
                                $absoluteLink = $link
                            } else {
                                $absoluteLink = Resolve-RelativeUrlInJob -CurrentPageUrl $url -RelativeUrl $link
                            }
                            # Normalize the resolved link (e.g., remove trailing slash if not root)
                            # Guard against $null first (Resolve-RelativeUrlInJob returns $null on failure) before calling .EndsWith
                            if ($absoluteLink -and $absoluteLink -ne $baseUriAbsoluteUri -and $absoluteLink.EndsWith('/')) {
                                $absoluteLink = $absoluteLink.TrimEnd('/')
                            }
                            # Check if it's an internal URL and not excluded
                            if ($absoluteLink -and (Test-IsInternalUrlInJob -Url $absoluteLink) -and -not (Test-IsExcludedUrlInJob -Url $absoluteLink)) {
                                # Don't check AllFoundUrls here; the parent script will handle global uniqueness
                                # Store the link with its source URL
                                $foundLinks += [PSCustomObject]@{
                                    Url = $absoluteLink
                                    SourceUrl = $url
                                }
                            }
                        }
                    } elseif ($response.StatusCode -eq 404) {
                        $is404Error = $true
                        Write-Warning "Job for '$url': 404 Not Found"
                    } else {
                        Write-Warning "Job for '$url': Failed to retrieve. Status Code: $($response.StatusCode)"
                    }
                } catch {
                    $retryCount++
                    # Check if it's a 404 error in the exception
                    if ($_.Exception.Response.StatusCode -eq 404 -or $_.Exception.Message -like "*404*") {
                        $is404Error = $true
                        $statusCode = 404
                        $success = $true # Don't retry 404s
                        Write-Warning "Job for '$url': 404 Not Found"
                    } elseif ($retryCount -le $maxRetries) {
                        # Exponential backoff: wait 1s, 2s, 4s, etc.
                        $waitTime = [math]::Pow(2, $retryCount - 1)
                        Write-Warning "Job for '$url': Attempt $retryCount failed. Retrying in ${waitTime}s... Error: $($_.Exception.Message)"
                        Start-Sleep -Seconds $waitTime
                    } else {
                        # Max retries exceeded
                        Write-Warning "Job for '$url': Max retries ($maxRetries) exceeded. Final error: $($_.Exception.Message)"
                        $success = $true # Stop trying
                    }
                }
            }
            # Return the URL that was processed by this job and the unique links it found
            [PSCustomObject]@{
                ProcessedUrl = $url
                NewLinks = $foundLinks # Links now include source information
                Is404 = $is404Error
                StatusCode = $statusCode
            }
        } -ArgumentList $urlToProcess, $BaseScheme, $BaseHost, $ExcludePatterns, $BaseUri.AbsoluteUri, $UserAgent, $RequestTimeoutSeconds, $MaxRetries
        $jobs += $job
    }
    # Wait for all jobs in the current batch to complete
    Write-Host " Waiting for batch to complete..." -ForegroundColor DarkCyan
    Wait-Job -Job $jobs | Out-Null
    # Collect results and update central state
    $newUrlsFoundInBatch = 0
    $errors404InBatch = 0
    foreach ($job in $jobs) {
        $result = Receive-Job -Job $job # Don't keep job results to save memory
        Remove-Job -Job $job -Force | Out-Null # Clean up job immediately
        if ($result) {
            # Check if this URL returned a 404 error
            if ($result.Is404) {
                if (-not $Error404Urls.ContainsKey($result.ProcessedUrl)) {
                    # Get the source page for this 404 URL
                    $sourcePageFor404 = if ($UrlSources.ContainsKey($result.ProcessedUrl)) {
                        $UrlSources[$result.ProcessedUrl]
                    } else {
                        "Unknown source"
                    }
                    $Error404Urls[$result.ProcessedUrl] = $sourcePageFor404
                    $error404Entry = "$($result.ProcessedUrl) | Found on: $sourcePageFor404"
                    Add-Content -Path $Error404File -Value $error404Entry
                    $errors404InBatch++
                    Write-Host " 404 ERROR: $($result.ProcessedUrl)" -ForegroundColor Red
                    Write-Host " --> Found on: $sourcePageFor404" -ForegroundColor DarkRed
                }
            }
            # The URL processed by this job was already added to $ProcessedUrls earlier
            $newLinksFromJob = $result.NewLinks
            foreach ($linkInfo in $newLinksFromJob) {
                $newLink = if ($linkInfo.Url) { $linkInfo.Url } else { $linkInfo }
                $sourceUrl = if ($linkInfo.SourceUrl) { $linkInfo.SourceUrl } else { $result.ProcessedUrl }
                # Only add to queue and found file if not already globally found
                if (-not $AllFoundUrls.ContainsKey($newLink) -and -not [string]::IsNullOrWhiteSpace($newLink)) {
                    $UrlsToProcess.Enqueue($newLink)
                    Add-Content -Path $FoundUrlsFile -Value $newLink
                    $AllFoundUrls[$newLink] = $true
                    # Track where this URL was found
                    $UrlSources[$newLink] = $sourceUrl
                    $newUrlsFoundInBatch++
                    $totalFound++
                }
            }
        }
    }
    # Update total processed count
    $totalProcessed += $currentBatchUrls.Count
    # Display batch summary
    $summaryMessage = " Batch $batchNumber complete: Processed $($currentBatchUrls.Count) URLs, found $newUrlsFoundInBatch new URLs"
    if ($errors404InBatch -gt 0) {
        $summaryMessage += ", $errors404InBatch 404 errors"
    }
    Write-Host $summaryMessage -ForegroundColor Green
    $percentComplete = if ($totalFound -gt 0) { [math]::Round(($totalProcessed / $totalFound) * 100, 1) } else { 0 }
    $progressMsg = " Total progress: $totalProcessed/$totalFound processed ($percentComplete%)"
    Write-Host $progressMsg -ForegroundColor Magenta
    # Add configurable delay between batches to avoid overwhelming the server
    if ($DelayBetweenBatches -gt 0 -and $UrlsToProcess.Count -gt 0) {
        Write-Host " Waiting $DelayBetweenBatches ms before next batch..." -ForegroundColor DarkGray
        Start-Sleep -Milliseconds $DelayBetweenBatches
    }
    # Memory optimization: Force garbage collection every 10 batches for large crawls
    if ($batchNumber % 10 -eq 0 -and $totalProcessed -gt 50) {
        [System.GC]::Collect()
        [System.GC]::WaitForPendingFinalizers()
    }
}
Write-Host "`nCrawl complete!" -ForegroundColor Green
Write-Host "Final statistics: Processed $totalProcessed URLs out of $totalFound found URLs" -ForegroundColor Cyan
if ($Error404Urls.Count -gt 0) {
    Write-Host "Found $($Error404Urls.Count) 404 errors (see 404-errors.txt for details)" -ForegroundColor Red
}
Write-Host "Generating sitemap.xml..." -ForegroundColor Green
# --- Sitemap XML Generation ---
# Exclude URLs that returned 404 so broken pages don't end up in the sitemap
$processedUrlsList = Get-Content -Path $ProcessedUrlsFile | Where-Object { -not $Error404Urls.ContainsKey($_) } | Sort-Object -Unique
$xmlHeader = '<?xml version="1.0" encoding="UTF-8"?>'
$urlsetStart = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
$urlsetEnd = '</urlset>'
$sitemapContent = New-Object System.Text.StringBuilder
[void]$sitemapContent.AppendLine($xmlHeader)
[void]$sitemapContent.AppendLine($urlsetStart)
foreach ($url in $processedUrlsList) {
    [void]$sitemapContent.AppendLine('  <url>')
| [void]$sitemapContent.AppendLine(" <loc>$url</loc>") | |
    # You can add lastmod, changefreq, priority if you have this data
    # [void]$sitemapContent.AppendLine("    <lastmod>$(Get-Date -Format 'yyyy-MM-dd')</lastmod>")
    # [void]$sitemapContent.AppendLine('    <changefreq>daily</changefreq>')
    # [void]$sitemapContent.AppendLine('    <priority>0.8</priority>')
    [void]$sitemapContent.AppendLine('  </url>')
}
[void]$sitemapContent.AppendLine($urlsetEnd)
$sitemapContent.ToString() | Set-Content -Path $SitemapFile -Encoding UTF8
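# Optional sanity check (a minimal sketch): confirm the generated file parses
# as XML before submitting it anywhere. Uncomment to use.
# [xml]$sitemapXml = Get-Content -Path $SitemapFile -Raw
# Write-Host "Sitemap parses OK with $($sitemapXml.urlset.url.Count) <url> entries."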
| Write-Host "Sitemap generated successfully at: $SitemapFile" -ForegroundColor Green | |
| Write-Host "Processed URLs can be found in: $ProcessedUrlsFile" -ForegroundColor Green | |
| Write-Host "Unprocessed (found) URLs can be found in: $FoundUrlsFile" -ForegroundColor Green | |
if ($Error404Urls.Count -gt 0) {
    Write-Host "404 error URLs can be found in: $Error404File" -ForegroundColor Red
}
Write-Host "Output files are located in: $CurrentOutputDirectory" -ForegroundColor Green