Checking for identical files within a patch file
#!/bin/bash
PATCH_FILE="${1:-full_diff.patch}"

# Extract filenames from the patch
grep -a "^diff --git" "$PATCH_FILE" | sed 's|diff --git a/.* b/||' | sort -u > /tmp/patch_files.txt
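# Note (assumption): the sed above keeps the post-image ("b/") path and
# relies on git's default a/ and b/ prefixes; patches generated with
# --no-prefix or custom --src-prefix/--dst-prefix need a different pattern.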
| echo "=== Duplicate File Report ===" | |
| echo "Patch: $PATCH_FILE" | |
| echo "Total files in patch: $(wc -l < /tmp/patch_files.txt)" | |
| echo "" | |
# Hash each file that exists in the working tree (deleted or renamed
# files listed in the patch are silently skipped).
# Note: md5sum and stat -c are GNU coreutils; on macOS/BSD use
# "md5 -q" and "stat -f%z" instead.
> /tmp/file_hashes.txt
while read -r filepath; do
    if [[ -f "$filepath" ]]; then
        hash=$(md5sum "$filepath" 2>/dev/null | cut -d' ' -f1)
        size=$(stat -c%s "$filepath" 2>/dev/null || echo 0)
        echo "$hash|$size|$filepath" >> /tmp/file_hashes.txt
    fi
done < /tmp/patch_files.txt
# Group by hash and show duplicates. Filenames in a group are joined
# with ";" so each group stays on a single line through sort/read
# (assumes paths contain no "|" or ";" characters).
echo "=== Duplicate Groups (sorted by wasted space) ==="
echo ""
sort -t'|' -k1,1 /tmp/file_hashes.txt | awk -F'|' '
{
    hash = $1; size = $2; file = $3
    if (hash == prev_hash) {
        files[hash] = files[hash] ";" file
        count[hash]++
    } else {
        files[hash] = file
        count[hash] = 1
        sizes[hash] = size
    }
    prev_hash = hash
}
END {
    for (h in count) {
        if (count[h] > 1) {
            wasted = sizes[h] * (count[h] - 1)
            printf "%d|%d|%d|%s|%s\n", wasted, count[h], sizes[h], h, files[h]
        }
    }
}' | sort -t'|' -k1,1 -rn | while IFS='|' read -r wasted count size hash files; do
    wasted_kb=$((wasted / 1024))
    size_kb=$((size / 1024))
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo "Hash: $hash"
    echo "Copies: $count | File size: ${size_kb}KB (${size} bytes) | Wasted: ${wasted_kb}KB"
    echo "Files:"
    echo "$files" | tr ';' '\n' | sed 's/^/  /'
    echo ""
done
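# (The display loop above runs in a pipeline subshell, so variables it
# sets cannot survive into the summary; the totals are therefore
# recomputed from /tmp/file_hashes.txt below.)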
# Summary
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "=== SUMMARY ==="
total_wasted=$(sort -t'|' -k1,1 /tmp/file_hashes.txt | awk -F'|' '
{
    hash = $1; size = $2
    if (hash == prev_hash) { count[hash]++ }
    else { count[hash] = 1; sizes[hash] = size }
    prev_hash = hash
}
END {
    for (h in count) if (count[h] > 1) sum += sizes[h] * (count[h] - 1)
    print sum + 0   # "+ 0" prints 0 rather than an empty string when there are no duplicates
}')
dup_groups=$(sort -t'|' -k1,1 /tmp/file_hashes.txt | cut -d'|' -f1 | uniq -d | wc -l)
echo "Duplicate groups: $dup_groups"
echo "Total wasted space: $((total_wasted / 1024)) KB (~$((total_wasted / 1024 / 1024)) MB)"