Last active
May 3, 2022 01:31
-
-
Save CCRcmcpe/f88a5b93dcb918a13f4b90d3b4cd6c02 to your computer and use it in GitHub Desktop.
哔哩哔哩 爬取UP主专栏图片
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Crawls the images embedded in a Bilibili user's column articles.
#
# Parameters:
#   userId  - numeric id of the user; sent as `mid` to the article-list API.
#   saveDir - directory under which one sub-directory per article is created.
param (
    [ValidateRange(1, 999999999)]
    [int]
    $userId,
    [ValidateNotNullOrEmpty()]
    [string]
    $saveDir
)

#$sessdata = ''

$PSStyle.Progress.UseOSCIndicator = $true
$PSStyle.Progress.View = 'Classic'
# Turn every error into a terminating one so a failed request aborts the run.
$ErrorActionPreference = 'Stop'

if (!(Test-Path $saveDir)) {
    $null = mkdir $saveDir
}

# --- Collect the article list, one API page at a time -----------------------
$page = 0
$totalPagesCount = 1
# Start from a real empty array ($() evaluates to $null) so that += always
# appends and .Count is always an element count.
$articles = @()
do {
    $page++
    $response = Invoke-RestMethod -Method Get `
        -Uri 'https://api.bilibili.com/x/space/article' `
        -Body @{ mid = $userId; pn = $page }
    #-Headers @{ SESSDATA = $sessdata }

    # Skip the append when the page carries no articles; `@() += $null` would
    # otherwise add a $null element that the download loop would trip over.
    if ($null -ne $response.data.articles) {
        $articles += @($response.data.articles)
    }

    # Page count = ceil(count / page size). An [int] cast rounds to nearest,
    # which drops the last partial page whenever it is less than half full.
    # Clamp to 1 so an empty result set cannot cause a division by zero below.
    $pageSize = $response.data.ps
    if ($pageSize -gt 0) {
        $totalPagesCount = [Math]::Max(1, [int][Math]::Ceiling([double]$response.data.count / $pageSize))
    }
    else {
        $totalPagesCount = 1
    }

    # Write-Progress rejects values above 100, so cap the percentage.
    $percent = [Math]::Min(100, $page / $totalPagesCount * 100)
    Write-Progress '爬取专栏' -Id 0 -Status "进行中 [$page/$totalPagesCount] 页 / 专栏 [$($articles.Count)] 个 ->" -PercentComplete $percent
} while ($page -lt $totalPagesCount)
Write-Progress '爬取专栏' -Id 0 -Status '已完成'

# Matches one <figure> element of an article page; captures the protocol-relative
# image URL (imageLink) and the raw inner HTML of its <figcaption> (imageDesc).
$imageRegex = [regex]'<figure.*?data-src="(?<imageLink>//i0.hdslb.com/bfs/article/.*?)".*?<figcaption.*?>(?<imageDesc>.*?)</figcaption>.*?</figure>'
# Turns arbitrary text into something usable as a single file or directory name.
# Control characters (0x00-0x1F, 0x7F) and the characters " * / : < > ? \ |
# are each replaced by '%' followed by their two-digit uppercase hex code;
# every other character is passed through unchanged.
#
# Parameters:
#   path - the text to sanitise.
# Output:
#   the sanitised string.
function NormalizePath([string] $path) {
    $forbidden = '[\x00-\x1F\x7F"\*\/:<>\?\\\|]'

    # Called once per forbidden character; yields its percent-encoded form.
    $percentEncode = [System.Text.RegularExpressions.MatchEvaluator] {
        param($hit)
        '%{0:X2}' -f [int][char]$hit.Value
    }

    [regex]::Replace($path, $forbidden, $percentEncode)
}
# --- Queue the images of every collected article ----------------------------
# For each article: create a directory named after its (sanitised) title, fetch
# the article page, extract every figure with $imageRegex and hand each image
# URL to the aria2 JSON-RPC endpoint on 127.0.0.1:6800. Articles whose directory
# already exists are counted as ignored and skipped.
$articleCount = 0
$addedCount = 0
$ignoredCount = 0
foreach ($article in $articles) {
    $articleCount++
    Write-Progress '爬取图片' -Id 1 -Status "已处理 [$articleCount/$($articles.Count)] 个专栏 ->" -PercentComplete ($articleCount / $articles.Count * 100)
    Write-Progress '添加下载' -Id 2 -ParentId 1 -Status "添加 $addedCount 个 | 忽略 $ignoredCount 个"

    # Join-Path picks the platform's separator instead of a hard-coded '\'.
    $articleSaveDir = Join-Path $saveDir (NormalizePath $article.title)

    # -LiteralPath: NormalizePath leaves '[' and ']' untouched, and the default
    # -Path parameter would interpret them as wildcard patterns.
    if (Test-Path -LiteralPath $articleSaveDir) {
        $ignoredCount++
        continue
    }
    $addedCount++
    $null = mkdir $articleSaveDir

    # Fetch the article page in a background job so a hung request can be
    # abandoned after 5 seconds and retried.
    # NOTE(review): the retry is unbounded - a page that always fails (for
    # example a deleted article) keeps this loop spinning forever.
    do {
        $job = Start-Job {
            $ProgressPreference = 'SilentlyContinue';
            Invoke-WebRequest "https://www.bilibili.com/read/cv$input/"
        } -InputObject $article.id
        $null = Wait-Job $job -Timeout 5
        $failed = $job.State -ne 'Completed'
        if ($failed) {
            Stop-Job $job
            # Discard the abandoned job so it does not accumulate in the session.
            Remove-Job $job -Force
        }
    } until (!$failed)
    $result = Receive-Job $job
    Remove-Job $job -Force

    $imageCount = 0
    foreach ($match in $imageRegex.Matches($result.Content)) {
        $imageCount++
        # The captured link is protocol-relative ("//i0.hdslb.com/...").
        $imageLink = "https:$($match.Groups['imageLink'].Value)"

        # Use the caption up to its first <br tag as the file name; fall back
        # to the image's position in the article when the caption is empty.
        $imageDesc = $match.Groups['imageDesc'].Value
        $index = $imageDesc.IndexOf('<br')
        if ($index -ne -1) {
            $imageDesc = $imageDesc.Remove($index)
        }
        if ($imageDesc -eq '') {
            $imageDesc = $imageCount.ToString()
        }
        $fileName = (NormalizePath $imageDesc) + '.jpg'

        # aria2.addUri takes [uris], {options}. The explicit -Depth keeps the
        # nested array intact regardless of ConvertTo-Json's default limit.
        $rpcRequest = @{
            jsonrpc = '2.0'
            method  = 'aria2.addUri'
            id      = 'wdnmd'
            params  = @(
                @($imageLink),
                @{ dir = $articleSaveDir; out = $fileName }
            )
        }
        $rpcBody = [System.Text.Encoding]::UTF8.GetBytes((ConvertTo-Json $rpcRequest -Depth 4))

        # Silence Invoke-WebRequest's own progress bar for the RPC call only,
        # so it does not interfere with the bars drawn above.
        $ProgressPreference = 'SilentlyContinue'
        $null = Invoke-WebRequest 'http://127.0.0.1:6800/jsonrpc' `
            -Method Post `
            -Headers @{
                "Content-Type" = "application/json; charset=utf-8"
            } `
            -Body $rpcBody
        $ProgressPreference = 'Continue'
    }
}
Write-Progress '爬取图片' -Id 1 -Status "已完成"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment