Skip to content

Instantly share code, notes, and snippets.

@AronNovak
Created January 16, 2026 08:00
Show Gist options
  • Select an option

  • Save AronNovak/918c294f8fb14e76bb4fa2d1eab38bb2 to your computer and use it in GitHub Desktop.

Select an option

Save AronNovak/918c294f8fb14e76bb4fa2d1eab38bb2 to your computer and use it in GitHub Desktop.
Checking for identical files within a patch file
#!/bin/bash
# Report groups of byte-identical files referenced by a patch, to spot
# duplicated content and the disk space wasted by the extra copies.
#
# Usage: ./dup_report.sh [patch-file]      (default: full_diff.patch)
#
# Intermediate state lives in /tmp/patch_files.txt and /tmp/file_hashes.txt.
set -u

PATCH_FILE="${1:-full_diff.patch}"
if [[ ! -f "$PATCH_FILE" ]]; then
  printf 'error: patch file not found: %s\n' "$PATCH_FILE" >&2
  exit 1
fi

# Extract the b/-side path of every "diff --git a/... b/..." header.
# -a forces text mode in case the patch embeds binary hunks.
grep -a "^diff --git" "$PATCH_FILE" | sed 's|diff --git a/.* b/||' | sort -u > /tmp/patch_files.txt

echo "=== Duplicate File Report ==="
echo "Patch: $PATCH_FILE"
echo "Total files in patch: $(wc -l < /tmp/patch_files.txt)"
echo ""
# Hash every listed file that exists in the working tree.
# Output lines are "md5|size_bytes|path"; files missing locally are skipped.
# NOTE(review): paths containing '|' would break the field format — assumed absent.
> /tmp/file_hashes.txt
while IFS= read -r filepath; do   # IFS= keeps leading/trailing spaces in names
  if [[ -f "$filepath" ]]; then
    hash=$(md5sum -- "$filepath" 2>/dev/null | cut -d' ' -f1)
    # An unreadable file yields an empty hash; skip it rather than let all
    # such files collapse into one bogus "duplicate" group.
    [[ -n "$hash" ]] || continue
    # stat -c%s is GNU; fall back to BSD/macOS stat -f%z, then to 0.
    size=$(stat -c%s "$filepath" 2>/dev/null || stat -f%z "$filepath" 2>/dev/null || echo 0)
    echo "$hash|$size|$filepath" >> /tmp/file_hashes.txt
  fi
done < /tmp/patch_files.txt
# Group by hash and report every group with more than one copy,
# largest wasted space first.
echo "=== Duplicate Groups (sorted by wasted space) ==="
echo ""
sort -t'|' -k1,1 /tmp/file_hashes.txt | awk -F'|' '
{
  hash = $1; size = $2; file = $3
  if (hash == prev_hash) {
    # Join members with a LITERAL two-character "\n" marker ("\\n" in awk),
    # not a real newline: each group must stay on one output line or the
    # downstream sort/read pipeline would split records apart. The reader
    # expands the marker with printf %b.
    files[hash] = files[hash] "\\n " file
    count[hash]++
  } else {
    files[hash] = file
    count[hash] = 1
    sizes[hash] = size          # size recorded once per group (copies are identical)
  }
  prev_hash = hash
}
END {
  for (h in count) {
    if (count[h] > 1) {
      wasted = sizes[h] * (count[h] - 1)
      printf "%d|%d|%d|%s|%s\n", wasted, count[h], sizes[h], h, files[h]
    }
  }
}' | sort -t'|' -k1,1 -rn | while IFS='|' read -r wasted count size hash files; do
  wasted_kb=$((wasted / 1024))
  size_kb=$((size / 1024))
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Hash: $hash"
  echo "Copies: $count | File size: ${size_kb}KB (${size} bytes) | Wasted: ${wasted_kb}KB"
  echo "Files:"
  # %b expands the literal \n markers into real newlines (portable,
  # unlike echo -e).
  printf ' %b\n' "$files"
  echo ""
done
# Summary: total bytes that could be reclaimed by deduplicating
# (for each hash with N copies, size * (N - 1)).
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "=== SUMMARY ==="
total_wasted=$(sort -t'|' -k1,1 /tmp/file_hashes.txt | awk -F'|' '
{
  hash = $1; size = $2
  if (hash == prev_hash) { count[hash]++ }
  else { count[hash] = 1; sizes[hash] = size }
  prev_hash = hash
}
END {
  for (h in count) if (count[h] > 1) sum += sizes[h] * (count[h] - 1)
  # "+ 0" forces a numeric 0 instead of an empty line when no duplicates exist.
  print sum + 0
}')
# tr strips the leading padding BSD wc emits, so the count prints cleanly.
dup_groups=$(sort -t'|' -k1,1 /tmp/file_hashes.txt | cut -d'|' -f1 | uniq -d | wc -l | tr -d ' ')
echo "Duplicate groups: $dup_groups"
echo "Total wasted space: $((total_wasted / 1024)) KB (~$((total_wasted / 1024 / 1024)) MB)"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment