Last active
May 9, 2025 18:24
-
-
Save jasonadsit/b6cc547f337ae44ef3b02f0e8f6b7fc9 to your computer and use it in GitHub Desktop.
Some code to cull files for eDiscovery 🤷‍♂️
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$PathsToInventory = 'D:\','E:\','F:\','H:\','I:\','J:\'
<#
    Culls a copied file collection down to the in-scope files for eDiscovery.

    It's assumed that the sources are drive letters.
    It's also assumed that all files have been copied to the same
    root directory, under folders corresponding to their drive letters
    (e.g. D:\foo\bar.txt was copied to .\D\foo\bar.txt).
    The culling should have been done first but here we are :-)
    This may need to be refactored if UNC paths are used.
    Set-Location/cd into the working directory before you begin.

    Files written to the working directory:
        file-inventory.txt     LastWriteTime|MD5|Length|FullName for every source file
        unique-files.csv       one record per distinct MD5
        in-scope-files.csv     unique files that passed the size/path filters
        in-scope-hashes.txt    MD5 of every in-scope file
        in-scope-paths.txt     relative path of every in-scope file
        out-of-scope-files.txt relative paths that get DELETED at the end
#>
########## DON'T EDIT BELOW THIS ##########
$FileInventory = ".\file-inventory.txt"

### Step 1: inventory every file on the source drives ###
$Params = @{
    Path        = $PathsToInventory
    File        = $true
    Recurse     = $true
    Force       = $true
    ErrorAction = 'SilentlyContinue'
}
Get-ChildItem @Params | ForEach-Object {
    # The hash is left empty when the file cannot be read.
    $MD5 = ($_ | Get-FileHash -Algorithm MD5 -ErrorAction SilentlyContinue).Hash
    # 's' is the sortable date/time pattern (yyyy-MM-ddTHH:mm:ss).
    ($_.LastWriteTime.ToString('s'), $MD5, $_.Length, $_.FullName) -join '|'
} | Out-File -FilePath $FileInventory
[gc]::Collect()

### Step 2: load the inventory and map source paths to the local copies ###
$Params = @{
    Path      = $FileInventory
    Delimiter = '|'
    Header    = 'LastWriteTime','MD5','Length','FullName'
}
# @() keeps this an array even when the inventory has zero or one record.
$AllTheFiles = @(
    Import-Csv @Params |
        Select-Object -Property LastWriteTime,
                                MD5,
                                @{n='Length';e={[long]$_.Length}},
                                FullName
)
foreach ($File in $AllTheFiles) {
    # 'D:\foo\bar.txt' -> '.\D\foo\bar.txt'
    $File.FullName = ".\$($File.FullName -replace ':')"
}

### Step 3: keep the first file seen for each distinct hash ###
# NOTE(review): every file whose hash could not be computed shares the same
# empty MD5, so they collapse into a single group here, are dropped from the
# in-scope set below, and therefore end up in the out-of-scope (deleted) list.
# Confirm that this is intended.
$UniqueFiles = @(
    $AllTheFiles |
        Group-Object -Property MD5 |
        ForEach-Object { $_.Group[0] }
)

### Step 4: apply the size and path filters ###
# 10GB = 10737418240 | 5GB = 5368709120 | 2GB = 2147483648 | 1GB = 1073741824
$LargeFileSizeLimit = [long]5368709120
# Files over the size limit are only kept when they match this pattern.
$IncludedLargeFilesRegEx = '\.pst$|\.ost$'
# A path matching any of these alternatives is always out of scope.
$ExcludedFilesRegEx = @(
    '\.exe$', '\.dll$', '\.img$', '\.iso$', '\.bak$', '\.shp$',
    '\.msp$', '\.lnk$', '\.msu$', '\.cab$', '\.mdb$', '\.adf$',
    '\.bin$', '\.ige$', '\.rde$', '\.mdf$', '\.ldf$',
    '\.dbf$', '\.mui$', '\.shx$', '\.atx$', '\.sbn$', 'robocopy',
    '\.manifest$', '\.gdbtablx$', '\.gdbindexes$', '\.sys$',
    '\\WinSxS\\', '\.gdbtable$', '\\SMSSIG\$', '\.hpp$',
    '\\Sophos\\', '\\Windows\\servicing\\', '\.rp$', '\.htm$'
) -join '|'
# In scope = not excluded AND (within the size limit OR an allowed large file).
$InScopeFiles = @(
    $UniqueFiles | Where-Object {
        $_.FullName -notmatch $ExcludedFilesRegEx -and
        (
            $_.Length -le $LargeFileSizeLimit -or
            $_.FullName -match $IncludedLargeFilesRegEx
        )
    }
)
$UniqueFiles | Export-Csv -Path .\unique-files.csv
Clear-Variable -Name UniqueFiles
[gc]::Collect()

### Step 5: write the in-scope reports and build the lookup sets ###
# Drop records without a hash, then list the largest files first.
$InScopeFiles = @(
    $InScopeFiles |
        Where-Object { $_.MD5 } |
        Sort-Object -Property Length -Descending
)
$InScopeFiles | Export-Csv -Path '.\in-scope-files.csv'
$InScopeHashes = $InScopeFiles.MD5
$InScopeHashes | Out-File -FilePath '.\in-scope-hashes.txt'
$InScopePaths = $InScopeFiles.FullName
$InScopePaths | Out-File -FilePath '.\in-scope-paths.txt'
$InScopeHashesHashSet = [System.Collections.Generic.HashSet[string]]::new()
$InScopePathsHashSet = [System.Collections.Generic.HashSet[string]]::new()
# A foreach statement simply does nothing when there are no in-scope files,
# whereas calling .ForEach() on a null value would throw.
foreach ($File in $InScopeFiles) {
    [void]$InScopeHashesHashSet.Add($File.MD5)
    [void]$InScopePathsHashSet.Add($File.FullName)
}

### Step 6: work out what is out of scope ###
# NOTE(review): a duplicate of an in-scope file shares its hash, so it is NOT
# listed here and is NOT deleted. Every in-scope path also has an in-scope
# hash, which makes the path check redundant as written. If duplicates are
# meant to be removed, the hash check is the condition to reconsider.
$OutOfScopeFiles = @(
    foreach ($File in $AllTheFiles) {
        if (
            -not $InScopeHashesHashSet.Contains($File.MD5) -and
            -not $InScopePathsHashSet.Contains($File.FullName)
        ) {
            $File.FullName
        }
    }
)
Clear-Variable -Name AllTheFiles,InScopeFiles,InScopeHashes
[gc]::Collect()
$OutOfScopeFiles | Out-File -FilePath '.\out-of-scope-files.txt'

### Only if you're REALLY sure ###
# -LiteralPath is required: -Path would treat [ ] * ? in file names as
# wildcards, skipping those files or matching the wrong ones.
# The guard avoids a parameter-binding error when there is nothing to delete.
if ($OutOfScopeFiles.Count -gt 0) {
    Remove-Item -LiteralPath $OutOfScopeFiles -Force -ErrorAction SilentlyContinue
}
##################################
Clear-Variable -Name OutOfScopeFiles,InScopeHashesHashSet,InScopePathsHashSet
[gc]::Collect()
# That's all folks
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment