Skip to content

Instantly share code, notes, and snippets.

@jasonadsit
Last active May 9, 2025 18:24
Show Gist options
  • Select an option

  • Save jasonadsit/b6cc547f337ae44ef3b02f0e8f6b7fc9 to your computer and use it in GitHub Desktop.

Select an option

Save jasonadsit/b6cc547f337ae44ef3b02f0e8f6b7fc9 to your computer and use it in GitHub Desktop.
Some code to cull files for eDiscovery 🤷‍♂️
$PathsToInventory = @('D:\', 'E:\', 'F:\', 'H:\', 'I:\', 'J:\')
<#
Source roots to inventory. Edit the list above to match your collection.

Assumptions:
  * Every source is a drive letter.
  * All files have already been copied beneath a single root directory,
    into folders named after their source drive letters.
  * The working directory (Set-Location / cd) is that root directory
    before the script is started.

Culling should have happened before the copy, but here we are :-)
UNC sources are not handled; the script may need refactoring for them.
#>
########## DON'T EDIT BELOW THIS ##########
# Pipe-delimited inventory file, written to the current working directory.
$FileInventory = ".\file-inventory.txt"

# Recursively enumerate files only (no directories) under every source root.
# -Force includes hidden/system items; enumeration errors are suppressed.
$Params = @{
    Path        = $PathsToInventory
    Recurse     = $true
    File        = $true
    Force       = $true
    ErrorAction = 'SilentlyContinue'
}

# One output line per file: LastWriteTime|MD5|Length|FullName
Get-ChildItem @Params |
    ForEach-Object {
        # Sortable timestamp, e.g. 2024-01-31T13:45:00
        $Stamp = $_.LastWriteTime.ToString('s')
        # Hashing errors are suppressed; the MD5 field is then left empty.
        $Digest = ($_ | Get-FileHash -Algorithm MD5 -ErrorAction SilentlyContinue).Hash
        ($Stamp, $Digest, $_.Length, $_.FullName) -join '|'
        Clear-Variable -Name Stamp,Digest
    } |
    Out-File -FilePath $FileInventory
[gc]::Collect()
###
# Read the inventory back in. The file has no header row, so the column
# names are supplied here.
$Params = @{
    Path      = $FileInventory
    Header    = 'LastWriteTime','MD5','Length','FullName'
    Delimiter = '|'
}
# Import-Csv yields strings; Length is cast to [long] so later size
# comparisons and sorting are numeric.
$AllTheFiles = Import-Csv @Params | Select-Object -Property @(
    'LastWriteTime'
    'MD5'
    @{ Name = 'Length'; Expression = { [long]$_.Length } }
    'FullName'
)
# Re-root each path under the working directory by stripping the colon(s):
# 'D:\dir\file.txt' becomes '.\D\dir\file.txt'.
$AllTheFiles.ForEach({
    $_.FullName = '.\' + ($_.FullName -replace ':', '')
})
# De-duplicate: keep the first inventory entry for each distinct MD5 value.
$UniqueFiles = $AllTheFiles |
    Group-Object -Property MD5 |
    ForEach-Object { $_.Group[0] }
# Size threshold in bytes. Handy values:
#   10GB = 10737418240 | 5GB = 5368709120 | 2GB = 2147483648 | 1GB = 1073741824
$LargeFileSizeLimit = [long]5368709120

# Files at or above the threshold are kept only if their path matches this.
$IncludedLargeFilesRegEx = '\.pst$|\.ost$'

# Any path matching one of these patterns is excluded regardless of size.
# The list is joined with '|' into a single alternation.
$ExcludedFilesRegEx = @(
    '\.exe$', '\.dll$', '\.img$', '\.iso$', '\.bak$', '\.shp$'
    '\.msp$', '\.lnk$', '\.msu$', '\.cab$', '\.mdb$', '\.adf$'
    '\.bin$', '\.ige$', '\.rde$', '\.bin$', '\.mdf$', '\.ldf$'
    '\.dbf$', '\.mui$', '\.shx$', '\.atx$', '\.sbn$', 'robocopy'
    '\.manifest$', '\.gdbtablx$', '\.gdbindexes$', '\.sys$'
    '\\WinSxS\\', '\.gdbtable$', '\\SMSSIG\$', '\.hpp$'
    '\\Sophos\\', '\\Windows\\servicing\\', '\.rp$', '\.htm$'
) -join '|'

# A file is in scope when it is not excluded AND either
#   - its size is at or below the threshold, or
#   - its size is at or above the threshold and it is an included large type.
$InScopeFiles = $UniqueFiles.Where({
    $_.FullName -notmatch $ExcludedFilesRegEx -and (
        $_.Length -le $LargeFileSizeLimit -or (
            $_.Length -ge $LargeFileSizeLimit -and
            $_.FullName -match $IncludedLargeFilesRegEx
        )
    )
})
# Save the de-duplicated list for reference, then release it.
$UniqueFiles | Export-Csv -Path '.\unique-files.csv'
Clear-Variable -Name UniqueFiles
[gc]::Collect()

# Drop entries with no hash, then order largest-first and save.
$InScopeFiles = $InScopeFiles.Where({ $_.MD5 }) |
    Sort-Object -Property Length -Descending
$InScopeFiles | Export-Csv -Path '.\in-scope-files.csv'

# Plain-text lists of the in-scope hashes and paths.
$InScopeHashes = $InScopeFiles.MD5
$InScopePaths = $InScopeFiles.FullName
$InScopeHashes | Out-File -FilePath '.\in-scope-hashes.txt'
$InScopePaths | Out-File -FilePath '.\in-scope-paths.txt'

# Hash sets for constant-time membership checks in the filter below.
$InScopeHashesHashSet = [System.Collections.Generic.HashSet[string]]::new()
$InScopePathsHashSet = [System.Collections.Generic.HashSet[string]]::new()
$InScopeHashes.ForEach({ $null = $InScopeHashesHashSet.Add($_) })
$InScopePaths.ForEach({ $null = $InScopePathsHashSet.Add($_) })

# A file is out of scope only when NEITHER its hash NOR its path is in scope.
# Consequently, extra copies that share an in-scope hash are not listed here.
$OutOfScopeFiles = $AllTheFiles.Where({
    -not (
        $InScopeHashesHashSet.Contains($_.MD5) -or
        $InScopePathsHashSet.Contains($_.FullName)
    )
}).FullName
Clear-Variable -Name AllTheFiles,InScopeFiles,InScopeHashes
[gc]::Collect()
$OutOfScopeFiles | Out-File -FilePath '.\out-of-scope-files.txt'
### Only if you're REALLY sure ###
# Permanently deletes every path in $OutOfScopeFiles (relative to the
# working directory). Deletion failures are suppressed.
#
# -LiteralPath is used instead of -Path because -Path treats characters
# such as [ ] * ? as wildcards: a file named 'report [final].docx' would
# not be matched (and so silently not deleted), or the pattern could match
# a different file. -LiteralPath takes each path exactly as written.
#
# The guard skips the call when nothing is out of scope; passing $null to
# Remove-Item raises a parameter-binding error.
if ($OutOfScopeFiles) {
    Remove-Item -LiteralPath $OutOfScopeFiles -Force -ErrorAction SilentlyContinue
}
##################################
Clear-Variable -Name OutOfScopeFiles,InScopeHashesHashSet,InScopePathsHashSet
[gc]::Collect()
# That's all folks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment