Skip to content

Instantly share code, notes, and snippets.

@AronNovak
Created January 16, 2026 08:00
Show Gist options
  • Select an option

  • Save AronNovak/918c294f8fb14e76bb4fa2d1eab38bb2 to your computer and use it in GitHub Desktop.

Select an option

Save AronNovak/918c294f8fb14e76bb4fa2d1eab38bb2 to your computer and use it in GitHub Desktop.
Checking for identical files within a patch file
#!/bin/bash
# Report groups of byte-identical files referenced by a patch, to spot
# duplicated content and the disk space wasted by the extra copies.
#
# Usage: ./dup_report.sh [patch-file]      (default: full_diff.patch)
#
# Intermediate state lives in /tmp/patch_files.txt and /tmp/file_hashes.txt.
set -u

PATCH_FILE="${1:-full_diff.patch}"
if [[ ! -f "$PATCH_FILE" ]]; then
  printf 'error: patch file not found: %s\n' "$PATCH_FILE" >&2
  exit 1
fi

# Extract the b/-side path of every "diff --git a/... b/..." header.
# -a forces text mode in case the patch embeds binary hunks.
grep -a "^diff --git" "$PATCH_FILE" | sed 's|diff --git a/.* b/||' | sort -u > /tmp/patch_files.txt

echo "=== Duplicate File Report ==="
echo "Patch: $PATCH_FILE"
echo "Total files in patch: $(wc -l < /tmp/patch_files.txt)"
echo ""
# Hash every listed file that exists in the working tree.
# Output lines are "md5|size_bytes|path"; files missing locally are skipped.
# NOTE(review): paths containing '|' would break the field format — assumed absent.
> /tmp/file_hashes.txt
while IFS= read -r filepath; do   # IFS= keeps leading/trailing spaces in names
  if [[ -f "$filepath" ]]; then
    hash=$(md5sum -- "$filepath" 2>/dev/null | cut -d' ' -f1)
    # An unreadable file yields an empty hash; skip it rather than let all
    # such files collapse into one bogus "duplicate" group.
    [[ -n "$hash" ]] || continue
    # stat -c%s is GNU; fall back to BSD/macOS stat -f%z, then to 0.
    size=$(stat -c%s "$filepath" 2>/dev/null || stat -f%z "$filepath" 2>/dev/null || echo 0)
    echo "$hash|$size|$filepath" >> /tmp/file_hashes.txt
  fi
done < /tmp/patch_files.txt
# Group by hash and report every group with more than one copy,
# largest wasted space first.
echo "=== Duplicate Groups (sorted by wasted space) ==="
echo ""
sort -t'|' -k1,1 /tmp/file_hashes.txt | awk -F'|' '
{
  hash = $1; size = $2; file = $3
  if (hash == prev_hash) {
    # Join members with a LITERAL two-character "\n" marker ("\\n" in awk),
    # not a real newline: each group must stay on one output line or the
    # downstream sort/read pipeline would split records apart. The reader
    # expands the marker with printf %b.
    files[hash] = files[hash] "\\n " file
    count[hash]++
  } else {
    files[hash] = file
    count[hash] = 1
    sizes[hash] = size          # size recorded once per group (copies are identical)
  }
  prev_hash = hash
}
END {
  for (h in count) {
    if (count[h] > 1) {
      wasted = sizes[h] * (count[h] - 1)
      printf "%d|%d|%d|%s|%s\n", wasted, count[h], sizes[h], h, files[h]
    }
  }
}' | sort -t'|' -k1,1 -rn | while IFS='|' read -r wasted count size hash files; do
  wasted_kb=$((wasted / 1024))
  size_kb=$((size / 1024))
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Hash: $hash"
  echo "Copies: $count | File size: ${size_kb}KB (${size} bytes) | Wasted: ${wasted_kb}KB"
  echo "Files:"
  # %b expands the literal \n markers into real newlines (portable,
  # unlike echo -e).
  printf ' %b\n' "$files"
  echo ""
done
# Summary: total bytes that could be reclaimed by deduplicating
# (for each hash with N copies, size * (N - 1)).
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "=== SUMMARY ==="
total_wasted=$(sort -t'|' -k1,1 /tmp/file_hashes.txt | awk -F'|' '
{
  hash = $1; size = $2
  if (hash == prev_hash) { count[hash]++ }
  else { count[hash] = 1; sizes[hash] = size }
  prev_hash = hash
}
END {
  for (h in count) if (count[h] > 1) sum += sizes[h] * (count[h] - 1)
  # "+ 0" forces a numeric 0 instead of an empty line when no duplicates exist.
  print sum + 0
}')
# tr strips the leading padding BSD wc emits, so the count prints cleanly.
dup_groups=$(sort -t'|' -k1,1 /tmp/file_hashes.txt | cut -d'|' -f1 | uniq -d | wc -l | tr -d ' ')
echo "Duplicate groups: $dup_groups"
echo "Total wasted space: $((total_wasted / 1024)) KB (~$((total_wasted / 1024 / 1024)) MB)"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment