# ==========================================================
# Parallel Feed / URL Validator (PowerShell 7+)
# Robust, with live (actually streaming) console output
# ==========================================================
|
|
|
<#
.SYNOPSIS
Validates every entry of a JSON "sources" list in parallel and writes a report.

.PARAMETER Path
Path to the JSON file. When omitted or not found, a file picker is shown on
Windows unless -NoGui is given.

.PARAMETER ThrottleLimit
Maximum number of sources checked concurrently (passed to ForEach-Object -Parallel).

.PARAMETER MaxRetries
Number of additional attempts after a failed request. 0 disables retries.

.PARAMETER TimeoutSec
Per-request timeout in seconds, passed to Invoke-WebRequest.

.PARAMETER NoGui
Never open the file picker; fail instead when -Path is unusable.

.PARAMETER ProbeHtmlForFeeds
Scan HTML responses for <link rel="alternate"> feed links. On by default;
disable with -ProbeHtmlForFeeds:$false.

.PARAMETER WriteFixedJson
Write feed_list_fixed.json next to the input file. On by default;
disable with -WriteFixedJson:$false.

.PARAMETER ShowLiveList
Print one console line per finished source. On by default;
disable with -ShowLiveList:$false.
#>
[CmdletBinding()]
param(
    [string]$Path,

    # At least one worker is required by ForEach-Object -Parallel.
    [ValidateRange(1, [int]::MaxValue)]
    [int]$ThrottleLimit = 12,

    # A negative value would mean "no attempt at all", so reject it up front.
    [ValidateRange(0, [int]::MaxValue)]
    [int]$MaxRetries = 2,

    [ValidateRange(0, [int]::MaxValue)]
    [int]$TimeoutSec = 25,

    [switch]$NoGui,

    [switch]$ProbeHtmlForFeeds = $true,

    [switch]$WriteFixedJson = $true,

    [switch]$ShowLiveList = $true
)

# Turn non-terminating errors in the main runspace into terminating ones.
$ErrorActionPreference = "Stop"

Write-Host "`n=== Feed Source Validator (Robust) ===`n" -ForegroundColor Cyan
|
|
|
function Resolve-InputPath {
    <#
    .SYNOPSIS
    Returns the full path of the JSON file to validate.

    .DESCRIPTION
    Uses -Path when it points to an existing file. Otherwise, on Windows and
    without -NoGui, asks the user through an OpenFileDialog.

    .PARAMETER Path
    Candidate path supplied by the caller; may be empty.

    .PARAMETER NoGui
    Suppresses the file picker.

    .OUTPUTS
    System.String. Throws when no usable file can be determined.
    #>
    param([string]$Path, [switch]$NoGui)

    # -PathType Leaf: a directory exists but is not a readable JSON file.
    if ($Path -and (Test-Path $Path -PathType Leaf)) {
        return (Resolve-Path $Path).Path
    }

    if ($NoGui -or -not $IsWindows) {
        throw "No -Path supplied (or not found). Provide -Path to the JSON file."
    }

    Add-Type -AssemblyName System.Windows.Forms | Out-Null

    $dialog = New-Object System.Windows.Forms.OpenFileDialog
    try {
        $dialog.Filter = "JSON files (*.json)|*.json"
        $answer = $dialog.ShowDialog()

        # Treat anything but an explicit OK (e.g. Cancel / closed window) as "nothing chosen".
        if ($answer -ne [System.Windows.Forms.DialogResult]::OK -or -not $dialog.FileName) {
            throw "No file selected."
        }
        return $dialog.FileName
    }
    finally {
        # The dialog wraps native resources; release them on every exit path.
        $dialog.Dispose()
    }
}
|
|
|
$inputPath = Resolve-InputPath -Path $Path -NoGui:$NoGui

# Load JSON
try {
    # -LiteralPath: the path is already resolved, so characters such as [ ] must
    # not be re-interpreted as wildcard patterns.
    $raw  = Get-Content -LiteralPath $inputPath -Raw -Encoding UTF8
    $data = $raw | ConvertFrom-Json -Depth 50
} catch {
    throw "Failed to parse JSON: $inputPath. Error: $($_.Exception.Message)"
}

if (-not $data.sources) { throw "JSON file doesn't contain a 'sources' array. Path: $inputPath" }

# Request headers sent with every probe (copied into each parallel runspace via $using:).
$headers = @{
    "User-Agent"      = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) FeedValidator/3.3"
    "Accept"          = "application/rss+xml, application/atom+xml, application/xml;q=0.9, application/json;q=0.9, text/html;q=0.8, */*;q=0.7"
    "Accept-Language" = "en-US,en;q=0.9"
    "Cache-Control"   = "no-cache"
    "Pragma"          = "no-cache"
}

$maxRedir = 10
$total    = @($data.sources).Count

Write-Host ("Validating {0} sources with ThrottleLimit={1}`n" -f $total, $ThrottleLimit) -ForegroundColor DarkCyan

# Live counters (main runspace); updated by the consumer stage of the pipeline.
$Completed  = 0
$okCount    = 0
$warnCount  = 0
$failCount  = 0
$results    = [System.Collections.Generic.List[object]]::new()
$sw         = [System.Diagnostics.Stopwatch]::StartNew()
$lastUiTick = 0   # stopwatch milliseconds at the last progress-bar refresh
|
|
|
# ==========================================================
# IMPORTANT: do not assign this pipeline to a variable (no "$stream = ...").
# An assignment collects all output first; piping directly keeps results streaming.
# ==========================================================
|
$data.sources | |
|
ForEach-Object -Parallel { |
|
|
|
# ---------------------------- |
|
# Helper functions (defined once per runspace) |
|
# ---------------------------- |
|
if (-not $script:__helpersLoaded) { |
|
|
|
function Normalize-Type {
    <#
    .SYNOPSIS
    Canonicalises a declared source type.

    .DESCRIPTION
    Trims and lower-cases the value, maps "atom" and "feed" to "rss", and
    falls back to "web" when nothing usable was declared.
    #>
    param([string]$t)

    # Trim before the emptiness check so a whitespace-only value also becomes "web"
    # instead of an empty (and later "unknown") type.
    $key = ([string]$t).Trim().ToLowerInvariant()
    if (-not $key) { return "web" }

    switch ($key) {
        "atom"  { "rss" }
        "feed"  { "rss" }
        default { $key }
    }
}
|
|
|
function Sniff-Type {
    <#
    .SYNOPSIS
    Guesses the content kind of a response.

    .DESCRIPTION
    Checks, in order: a ".gz" URL, the Content-Type header, then the first
    4096 characters of the body. Returns one of
    "gzip", "json", "rss", "xml", "csv" or "web" ("web" is the fallback).

    .PARAMETER Content
    Response body as text.

    .PARAMETER ContentTypeHeader
    Raw Content-Type header value; may be empty.

    .PARAMETER Url
    URL the body was fetched from.
    #>
    param([string]$Content, [string]$ContentTypeHeader, [string]$Url)

    if ($Url -match '\.gz($|[?#])') { return "gzip" }

    $ct = ($ContentTypeHeader ?? "").ToLowerInvariant()
    if ($ct -match "gzip") { return "gzip" }
    if ($ct -match "application/json") { return "json" }
    if ($ct -match "application/(rss|atom)\+xml") { return "rss" }
    if ($ct -match "application/xml|text/xml") { return "xml" }
    if ($ct -match "text/csv|application/csv") { return "csv" }
    if ($ct -match "text/html") { return "web" }

    # Header was inconclusive: look at the beginning of the body.
    $sample = ($Content ?? "")
    if ($sample.Length -gt 4096) { $sample = $sample.Substring(0, 4096) }
    $s = $sample.TrimStart()

    # A feed root usually follows an XML prolog (declaration, stylesheet PI,
    # comments, DOCTYPE). Skip those first, otherwise every feed that starts
    # with "<?xml" would be reported as generic XML.
    if ($s -match '^(<\?xml[^>]*\?>|<!--[\s\S]*?-->|<!DOCTYPE[^>]*>|\s)*<(rss|feed|rdf:RDF)\b') { return "rss" }
    if ($s -match '^<\?xml') { return "xml" }
    if ($s -match '^\s*<!doctype html|^\s*<html\b') { return "web" }
    if ($s -match '^\s*[{[]') { return "json" }

    # CSV heuristic: several lines, a delimiter, and no markup.
    $lines = $sample -split "`r?`n"
    if ($lines.Count -ge 2 -and ($sample -match ',' -or $sample -match ';') -and -not ($sample -match '<(html|rss|feed|\?xml)\b')) {
        return "csv"
    }

    return "web"
}
|
|
|
function Decompress-GzipToString {
    <#
    .SYNOPSIS
    Inflates a gzip byte array and decodes the result as UTF-8 text.

    .PARAMETER GzBytes
    Complete gzip stream.

    .OUTPUTS
    System.String. Throws when the bytes are not valid gzip data.
    #>
    param([byte[]]$GzBytes)

    $msIn  = $null
    $gz    = $null
    $msOut = $null
    try {
        $msIn  = [System.IO.MemoryStream]::new($GzBytes, $false)   # read-only view of the input
        $gz    = [System.IO.Compression.GZipStream]::new($msIn, [System.IO.Compression.CompressionMode]::Decompress)
        $msOut = [System.IO.MemoryStream]::new()
        $gz.CopyTo($msOut)

        # UTF8.GetString substitutes invalid sequences instead of throwing,
        # so no fallback decoder is needed.
        return [System.Text.Encoding]::UTF8.GetString($msOut.ToArray())
    }
    finally {
        # Release the streams even when decompression fails half-way.
        if ($null -ne $gz)    { $gz.Dispose() }
        if ($null -ne $msIn)  { $msIn.Dispose() }
        if ($null -ne $msOut) { $msOut.Dispose() }
    }
}
|
|
|
function Test-Content {
    <#
    .SYNOPSIS
    Verifies that a body can be parsed as the given type.

    .DESCRIPTION
    Returns nothing on success and throws on failure.
      rss   - well-formed XML whose root element is rss, feed (Atom) or RDF (RSS 1.0)
      xml   - well-formed XML
      json  - valid JSON, optionally preceded by the ")]}'," anti-hijacking prefix
      api   - valid JSON
      csv   - at least two lines and a "," or ";" delimiter
      web   - always accepted

    .PARAMETER Content
    Response body as text.

    .PARAMETER Type
    One of the type names above; anything else throws "Unknown type".
    #>
    param([string]$Content, [string]$Type)

    switch ($Type) {
        "rss" {
            $doc  = [xml]$Content
            $root = if ($null -ne $doc) { $doc.DocumentElement } else { $null }

            # Inspect the root element's name directly. Property-style access
            # ($doc.rss) yields an empty string for a childless root and would
            # wrongly read as "absent". get_LocalName() ignores namespace
            # prefixes such as "rdf:".
            if ($null -ne $root -and $root.get_LocalName() -in @("rss", "feed", "RDF")) { return }
            throw "XML parsed but not RSS/Atom"
        }
        "xml" {
            $null = [xml]$Content
            return
        }
        "json" {
            # Strip the XSSI guard some APIs prepend before the JSON payload.
            $c = ($Content ?? "").TrimStart() -replace "^\)\]\}',\s*", ""
            $null = $c | ConvertFrom-Json -ErrorAction Stop
            return
        }
        "csv" {
            $text  = ($Content ?? "")
            $lines = $text -split "`r?`n"
            if ($lines.Count -ge 2 -and ($text -match ',' -or $text -match ';')) { return }
            throw "Doesn't look like CSV"
        }
        "web" { return }
        "api" {
            $null = (($Content ?? "").TrimStart() | ConvertFrom-Json -ErrorAction Stop)
            return
        }
        default { throw "Unknown type '$Type'" }
    }
}
|
|
|
function Extract-FeedLinksFromHtml {
    <#
    .SYNOPSIS
    Finds feed URLs advertised by <link rel="alternate"> tags in an HTML page.

    .DESCRIPTION
    Only tags whose type attribute mentions rss, atom or xml are kept.
    Relative hrefs are resolved against BaseUrl. Extraction is best-effort:
    a tag that cannot be turned into an absolute URL is skipped.

    .PARAMETER Html
    Page markup.

    .PARAMETER BaseUrl
    URL the page was loaded from.

    .OUTPUTS
    Objects with Url, Type (always "rss") and Mime, sorted and unique by Url.
    Nothing when no feed link is found.
    #>
    param([string]$Html, [string]$BaseUrl)

    if (-not $Html) { return }

    # An unusable base only prevents resolving relative links; absolute ones still work.
    $baseUri = $null
    if (-not [Uri]::TryCreate($BaseUrl, [UriKind]::Absolute, [ref]$baseUri)) { $baseUri = $null }

    $found = [System.Collections.Generic.List[object]]::new()

    # Deliberately not stored in $matches: that name is the automatic variable
    # written by the -match operator used further down.
    $linkTags = [regex]::Matches($Html, '(?is)<link\s+[^>]*rel\s*=\s*["'']alternate["''][^>]*>')

    foreach ($tagMatch in $linkTags) {
        $tag = $tagMatch.Value

        $hrefMatch = [regex]::Match($tag, '(?is)\shref\s*=\s*["'']([^"'']+)["'']')
        if (-not $hrefMatch.Success) { continue }

        $typeMatch = [regex]::Match($tag, '(?is)\stype\s*=\s*["'']([^"'']+)["'']')
        $type = if ($typeMatch.Success) { $typeMatch.Groups[1].Value.ToLowerInvariant() } else { "" }
        if ($type -notmatch 'rss|atom|xml') { continue }

        # Attribute values are HTML-encoded (e.g. "&amp;" inside query strings).
        $href = [System.Net.WebUtility]::HtmlDecode($hrefMatch.Groups[1].Value).Trim()
        if (-not $href) { continue }

        $abs = $null
        if ($href -match '^https?://') {
            $abs = $href
        }
        elseif ($null -ne $baseUri) {
            $resolved = $null
            if ([Uri]::TryCreate($baseUri, $href, [ref]$resolved)) { $abs = $resolved.AbsoluteUri }
        }
        if (-not $abs) { continue }   # unresolvable link: skip it, keep the others

        $found.Add([PSCustomObject]@{ Url = $abs; Type = "rss"; Mime = $type })
    }

    $found | Sort-Object Url -Unique
}
|
|
|
$script:__helpersLoaded = $true |
|
} |
|
|
|
# ---------------------------- |
|
# Per-item work |
|
# ---------------------------- |
|
# Snapshot the fields of the current source as plain strings.
$src      = $_
$name     = [string]$src.name
$category = [string]$src.category
$url      = [string]$src.url
$declared = [string]$src.type

# Declared type (canonical form); $testedAs later becomes the type that actually validated.
$declType = Normalize-Type $declared
$testedAs = $declType

# Result fields; pessimistic defaults until the request succeeds.
$status   = "FAIL"
$finalUrl = $url
$details  = ""
$httpStatus = $contentType = $sniffedAs = $null

# Suggestion fields, filled in only when a fix can be proposed.
$suggestedUrl = $suggestedType = $recommendation = $suggestedPatch = ""
|
|
|
function Invoke-With-Retry {
    <#
    .SYNOPSIS
    Issues a web request, retrying when the request itself throws.

    .DESCRIPTION
    HTTP error statuses do not throw (-SkipHttpErrorCheck), so only failures
    raised by Invoke-WebRequest are retried. Waits 2s, 4s, ... (capped at
    10s) between attempts and rethrows the last error once retries are used up.

    .PARAMETER u
    URL to request.

    .PARAMETER Headers
    Request headers.

    .PARAMETER TimeoutSec
    Per-attempt timeout in seconds.

    .PARAMETER MaxRedir
    Maximum number of redirects to follow.

    .PARAMETER MaxRetries
    Additional attempts after the first one; values below 0 are treated as 0.
    #>
    param([string]$u, [hashtable]$Headers, [int]$TimeoutSec, [int]$MaxRedir, [int]$MaxRetries)

    # Always make at least one attempt; a negative count used to skip the
    # request entirely and end in "throw $null".
    $retries = [Math]::Max(0, $MaxRetries)

    for ($attempt = 0; ; $attempt++) {
        try {
            return Invoke-WebRequest -Uri $u -Headers $Headers -TimeoutSec $TimeoutSec -MaximumRedirection $MaxRedir -SkipHttpErrorCheck -ErrorAction Stop
        } catch {
            if ($attempt -ge $retries) { throw $_ }
            Start-Sleep -Seconds ([Math]::Min(10, 2 * ($attempt + 1)))
        }
    }
}
|
|
|
# Fetch the source, work out which type its body validates as, derive the
# status (OK / WARN / FAIL) and collect suggestions. Any exception marks the
# source as FAIL with the exception message as details.
try {
    $resp = Invoke-With-Retry -u $url -Headers $using:headers -TimeoutSec $using:TimeoutSec -MaxRedir $using:maxRedir -MaxRetries $using:MaxRetries

    $httpStatus  = [int]$resp.StatusCode
    $contentType = [string]$resp.Headers.'Content-Type'
    # Prefer the post-redirect URL when the response object exposes it.
    try { if ($resp.BaseResponse?.RequestMessage?.RequestUri) { $finalUrl = $resp.BaseResponse.RequestMessage.RequestUri.AbsoluteUri } } catch { }

    $content = ""
    $isGz = ($finalUrl -match '\.gz($|[?#])') -or (($contentType ?? "").ToLowerInvariant() -match 'gzip')

    if ($isGz) {
        # GetTempPath() works on every platform; $env:TEMP is only defined on
        # Windows and made Join-Path fail elsewhere.
        $tmp = Join-Path ([System.IO.Path]::GetTempPath()) ("feedvalidator_" + [guid]::NewGuid().ToString("n") + ".gz")
        try {
            # Download the raw bytes to disk, then inflate them in memory.
            Invoke-WebRequest -Uri $finalUrl -Headers $using:headers -TimeoutSec $using:TimeoutSec -MaximumRedirection $using:maxRedir -SkipHttpErrorCheck -OutFile $tmp -ErrorAction Stop | Out-Null
            $bytes   = [System.IO.File]::ReadAllBytes($tmp)
            $content = Decompress-GzipToString $bytes
        } finally {
            if (Test-Path $tmp) { Remove-Item $tmp -Force -ErrorAction SilentlyContinue }
        }
    } else {
        $content = [string]$resp.Content
    }

    if (-not $content) { $content = "" }

    $sniffedAs = Sniff-Type -Content $content -ContentTypeHeader $contentType -Url $finalUrl

    # Candidate types in the order they are tried: declared, sniffed, then fallbacks.
    $candidates = [System.Collections.Generic.List[string]]::new()
    $candidates.Add($testedAs)
    if ($sniffedAs -and $sniffedAs -ne $testedAs -and $sniffedAs -ne "gzip") { $candidates.Add($sniffedAs) }

    $fallbackMap = @{
        rss  = @("xml", "json", "web")
        xml  = @("rss", "json", "web")
        json = @("xml", "web")
        api  = @("json", "web")
    }
    # Every other declared type (csv, web, unknown) only falls back to "web".
    $fallbacks = if ($fallbackMap.ContainsKey($testedAs)) { $fallbackMap[$testedAs] } else { @("web") }
    foreach ($t in $fallbacks) {
        if (-not $candidates.Contains($t)) { $candidates.Add($t) }
    }

    # The first candidate that validates wins.
    $parsed = $false
    foreach ($t in $candidates) {
        try { Test-Content -Content $content -Type $t; $testedAs = $t; $parsed = $true; break } catch { }
    }

    if ($httpStatus -ge 200 -and $httpStatus -lt 400 -and $parsed) {
        $status = "OK"
    }
    elseif (($httpStatus -eq 401 -or $httpStatus -eq 403) -and ($declType -in @("web", "api", "json"))) {
        $status = "WARN"; $details = "HTTP $httpStatus (auth/WAF likely)"
        $recommendation = "May require auth/API key or is blocking clients."
    }
    elseif ($httpStatus -eq 429) {
        $status = "WARN"; $details = "HTTP 429 (rate limited)"
        $recommendation = "Lower throttle / increase backoff or use an API key."
    }
    elseif ($httpStatus -eq 400 -and $declType -eq "api") {
        $status = "WARN"; $details = "HTTP 400 (API likely needs query parameters)"
        $recommendation = "Add required params (limit/resultsPerPage/date window/apiKey)."
    }
    else {
        $status = "FAIL"
        # Only claim a parse failure when none of the candidates validated;
        # otherwise the HTTP status alone is the reason.
        $details = if ($parsed) { "HTTP $httpStatus" } else { "HTTP $httpStatus (parse failed: $($candidates -join ', '))" }
    }

    # HTML page: look for an advertised feed and suggest it.
    if ($using:ProbeHtmlForFeeds -and ($testedAs -eq "web" -or $sniffedAs -eq "web")) {
        $links = Extract-FeedLinksFromHtml -Html $content -BaseUrl $finalUrl
        if ($links -and $links.Count -ge 1) {
            $best = $links | Select-Object -First 1
            $suggestedUrl  = $best.Url
            $suggestedType = $best.Type
            if ($status -eq "FAIL") { $status = "WARN" }
            if (-not $recommendation) { $recommendation = "HTML page advertises an RSS/Atom feed via <link rel='alternate'>." }
        }
    }

    # 404 on a directory-like URL (no trailing slash, no file extension): retry with a slash.
    if ($httpStatus -eq 404 -and $finalUrl -notmatch '/$' -and $finalUrl -notmatch '\.[a-zA-Z0-9]{1,5}($|[?#])') {
        try {
            $probe = $finalUrl + "/"
            $probeResp = Invoke-WebRequest -Uri $probe -Headers $using:headers -TimeoutSec $using:TimeoutSec -MaximumRedirection $using:maxRedir -SkipHttpErrorCheck -ErrorAction Stop
            if ([int]$probeResp.StatusCode -ge 200 -and [int]$probeResp.StatusCode -lt 400) {
                $suggestedUrl = $probe
                if (-not $suggestedType) { $suggestedType = $declType }
                if ($status -eq "FAIL") { $status = "WARN" }
                $recommendation = ($recommendation + " 404 fixed by adding trailing slash.").Trim()
            }
        } catch { }   # best-effort probe; a failure just means no suggestion
    }

    # Content validated, but as a different type than declared.
    if ($parsed -and $declType -ne $testedAs -and -not $suggestedType) {
        $suggestedType = $testedAs
        if (-not $recommendation) { $recommendation = "Declared type '$declType' didn't match content; validated as '$testedAs'." }
        if ($status -eq "OK") { $status = "WARN" }
    }

} catch {
    $status  = "FAIL"
    $details = $_.Exception.Message
    if (-not $recommendation) {
        if ($details -match "timed out|Timeout") { $recommendation = "Increase -TimeoutSec, reduce -ThrottleLimit, or retry from a different network." }
        elseif ($details -match "No such host|NameResolution|resolve") { $recommendation = "DNS/hostname issue. Check URL spelling or local DNS policy." }
        else { $recommendation = "Open in browser to confirm it still exists." }
    }
}
|
|
|
# Summarise any suggestion as a one-line patch hint; missing parts fall back
# to the current final URL / declared type.
if ($suggestedUrl -or $suggestedType) {
    $newUrl  = ($suggestedUrl)  ? $suggestedUrl  : $finalUrl
    $newType = ($suggestedType) ? $suggestedType : $declType
    $suggestedPatch = "Set type='$newType' url='$newUrl'"
}

# Emit one result row per source; the column order defines the CSV layout.
$row = [ordered]@{
    Name           = $name
    Category       = $category
    URL            = $url
    FinalUrl       = $finalUrl
    DeclaredType   = $declType
    TestedAs       = $testedAs
    SniffedAs      = $sniffedAs
    HttpStatus     = $httpStatus
    ContentType    = $contentType
    Status         = $status
    Details        = $details
    SuggestedType  = $suggestedType
    SuggestedUrl   = $suggestedUrl
    Recommendation = $recommendation
    SuggestedPatch = $suggestedPatch
}
[PSCustomObject]$row
|
|
|
} -ThrottleLimit $ThrottleLimit | |
|
ForEach-Object {
    # Consumer stage: runs in the main runspace once per result, as results arrive.
    $item = $_
    $results.Add($item)
    $Completed++

    # Tally by status; anything that is neither OK nor WARN counts as a failure.
    if     ($item.Status -eq "OK")   { $okCount++ }
    elseif ($item.Status -eq "WARN") { $warnCount++ }
    else                             { $failCount++ }

    if ($ShowLiveList) {
        $palette = @{ OK = "Green"; WARN = "Yellow" }
        $color   = $palette[[string]$item.Status] ?? "Red"
        $line    = "{0,-4} [{1,-4}] {2}" -f $item.Status, ($item.TestedAs ?? ""), $item.Name
        Write-Host $line -ForegroundColor $color
    }

    # Refresh the progress bar at most every 250 ms.
    $nowMs = $sw.ElapsedMilliseconds
    if ($nowMs - $lastUiTick -ge 250) {
        $percent = [math]::Round(($Completed / [double]$total) * 100, 1)
        Write-Progress -Activity "Validating sources" -Status "$Completed / $total (OK=$okCount WARN=$warnCount FAIL=$failCount)" -PercentComplete $percent
        $lastUiTick = $sw.ElapsedMilliseconds
    }
}
|
|
|
Write-Progress -Activity "Validating sources" -Completed
$sw.Stop()

# ==========================================================
# Final outputs: CSV report, optional fixed JSON, console summary
# ==========================================================
$baseDir = Split-Path $inputPath
$outCsv  = Join-Path $baseDir "feed_validation_report.csv"
$outJson = Join-Path $baseDir "feed_list_fixed.json"

# Ascending by status, then by name.
$final = $results | Sort-Object Status, Name

# -LiteralPath: the directory name must not be interpreted as a wildcard pattern.
$final | Export-Csv -LiteralPath $outCsv -NoTypeInformation -Encoding UTF8
Write-Host "`nReport written to: $outCsv" -ForegroundColor Cyan

if ($WriteFixedJson) {
    # Index results by name + url so each source is matched in one lookup.
    # The first row in $final order wins, as with a "first match" search.
    $byKey = @{}
    foreach ($row in $final) {
        $key = "{0}`n{1}" -f $row.Name, $row.URL
        if (-not $byKey.ContainsKey($key)) { $byKey[$key] = $row }
    }

    # Copy each source and apply the suggested type / url where one exists.
    $srcs = @(
        foreach ($s in $data.sources) {
            $new = [ordered]@{}
            foreach ($p in $s.PSObject.Properties) { $new[$p.Name] = $p.Value }

            # Cast to string exactly as the result rows did, so a source with
            # a missing name (null here, "" in the row) still finds its result.
            $m = $byKey["{0}`n{1}" -f [string]$s.name, [string]$s.url]
            if ($m) {
                if ($m.SuggestedType) { $new["type"] = $m.SuggestedType }
                if ($m.SuggestedUrl)  { $new["url"]  = $m.SuggestedUrl }
            }
            [PSCustomObject]$new
        }
    )

    # Rebuild the document with "sources" replaced in its original position.
    $fixed = [ordered]@{}
    foreach ($p in $data.PSObject.Properties) {
        if ($p.Name -eq "sources") { $fixed[$p.Name] = $srcs }
        else                       { $fixed[$p.Name] = $p.Value }
    }

    ($fixed | ConvertTo-Json -Depth 50) | Set-Content -LiteralPath $outJson -Encoding UTF8
    Write-Host "Fixed JSON written to: $outJson" -ForegroundColor Cyan
}

Write-Host "`n=== Summary ===" -ForegroundColor Cyan
Write-Host ("OK:   {0}" -f $okCount) -ForegroundColor Green
Write-Host ("WARN: {0}" -f $warnCount) -ForegroundColor Yellow
Write-Host ("FAIL: {0}" -f $failCount) -ForegroundColor Red
Write-Host ("Time: {0:n1}s" -f $sw.Elapsed.TotalSeconds) -ForegroundColor DarkGray

Write-Host "`nTop failures:" -ForegroundColor Cyan
$final | Where-Object Status -eq "FAIL" | Select-Object -First 15 Name,HttpStatus,DeclaredType,Details | Format-Table -AutoSize

Write-Host "`nDone.`n" -ForegroundColor Cyan