@r4d10n · Created November 11, 2025 05:11
Script to split/rejoin a large file into/from multiple parts with MD5 checksum verification, and to sync-upload the parts to an rclone-compatible remote (S3, Google Drive, etc.) to avoid request rate limiting.
#!/bin/bash
# Script to split/rejoin a large file into/from multiple parts with MD5 checksum checks.
# Sync-uploads the parts to an rclone-compatible remote (Google Drive, S3, etc.) to avoid request rate limiting.
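#
# Example usage (the script name is simply whatever you saved this file as):
#   ./split_upload.sh big_backup.tar.gz disk.img
#   ./split_upload.sh                # no arguments: prompts for file paths interactively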
# Configuration
REMOTE_PATH="drv:path"   # remote set up beforehand with `rclone config`
NUM_PARTS=20             # number of parts each file is split into
DELAY_BETWEEN_UPLOADS=15 # seconds to wait between part uploads
UPLOAD_RETRIES=3         # script-level retry attempts per upload
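# NUM_PARTS, DELAY_BETWEEN_UPLOADS and UPLOAD_RETRIES can also be adjusted
# interactively at runtime via the "Modify configuration?" prompt below.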
# Colors for output
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Function to print colored messages
print_info() {
echo -e "${GREEN}[INFO]${NC} $1"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
print_section() {
echo -e "${BLUE}[====== $1 ======]${NC}"
}
# Function to calculate MD5
calculate_md5() {
local file=$1
md5sum "$file" | awk '{print $1}'
}
# Function to create MD5 checksums for all parts
# Note: the caller captures this function's stdout, so progress messages go to
# stderr and only the path of the checksum file is echoed to stdout.
create_parts_md5() {
local temp_dir=$1
local base_name=$2
local md5_file="$temp_dir/${base_name}.md5sums"
print_info "Creating MD5 checksums for all parts..." >&2
> "$md5_file" # Clear/create file
for part in "$temp_dir/${base_name}.part_"*; do
if [ -f "$part" ]; then
local part_name=$(basename "$part")
local part_md5=$(calculate_md5 "$part")
echo "$part_md5 $part_name" >> "$md5_file"
print_info "  $part_name: $part_md5" >&2
fi
done
echo "$md5_file"
}
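# The .md5sums file holds "<md5> <part name>" pairs; the generated rejoin
# script parses it itself, so the exact md5sum -c spacing is not required.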
# Upload function with retry logic
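# Each failed attempt waits 2 x DELAY_BETWEEN_UPLOADS before the next try,
# on top of rclone's own --retries / --retries-sleep handling.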
upload_file() {
local file=$1
local attempt=1
while [ $attempt -le $UPLOAD_RETRIES ]; do
print_info "Uploading $(basename $file) (attempt $attempt/$UPLOAD_RETRIES)..."
if rclone copy "$file" "$REMOTE_PATH" \
--transfers 1 \
--checkers 1 \
--retries 3 \
--low-level-retries 5 \
--timeout 2h \
--retries-sleep 10s \
-P; then
print_info "✓ Successfully uploaded $(basename $file)"
return 0
else
print_warning "✗ Upload failed for $(basename $file), attempt $attempt/$UPLOAD_RETRIES"
attempt=$((attempt + 1))
if [ $attempt -le $UPLOAD_RETRIES ]; then
sleep $((DELAY_BETWEEN_UPLOADS * 2))
fi
fi
done
print_error "Failed to upload $(basename $file) after $UPLOAD_RETRIES attempts"
return 1
}
# Process a single file
process_file() {
local source_file=$1
local file_index=$2
local total_files=$3
print_section "Processing file $file_index/$total_files: $source_file"
# Check if source file exists
if [ ! -f "$source_file" ]; then
print_error "Source file '$source_file' not found!"
return 1
fi
# Get file info
local file_size=$(stat -c%s "$source_file" 2>/dev/null || stat -f%z "$source_file")
local base_name=$(basename "$source_file")
print_info "File size: $(numfmt --to=iec-i --suffix=B $file_size)"
# Create temporary directory for this file
local temp_dir="./split_parts_${base_name}_$$"
mkdir -p "$temp_dir"
print_info "Created temporary directory: $temp_dir"
# Calculate MD5 checksum of original file
print_info "Calculating MD5 checksum of original file..."
local original_md5=$(calculate_md5 "$source_file")
echo "$original_md5" > "$temp_dir/${base_name}.original.md5"
print_info "Original MD5: $original_md5"
# Split the file
print_info "Splitting file into $NUM_PARTS parts..."
cd "$temp_dir"
split -n $NUM_PARTS "../$source_file" "${base_name}.part_"
cd ..
# List parts
local parts=("$temp_dir/${base_name}".part_*)
print_info "Created ${#parts[@]} parts:"
for part in "${parts[@]}"; do
local part_size=$(stat -c%s "$part" 2>/dev/null || stat -f%z "$part")
echo " - $(basename $part): $(numfmt --to=iec-i --suffix=B $part_size)"
done
# Create MD5 checksums for all parts
local md5sums_file=$(create_parts_md5 "$temp_dir" "$base_name")
# Create metadata file
local metadata_file="$temp_dir/${base_name}.metadata.txt"
cat > "$metadata_file" << EOF
Original File: $source_file
Original Size: $file_size bytes
Original MD5: $original_md5
Number of Parts: $NUM_PARTS
Split Date: $(date)
Parts:
EOF
for part in "${parts[@]}"; do
echo " $(basename $part)" >> "$metadata_file"
done
print_info "Created metadata file: $metadata_file"
# Create rejoin and verify script
local rejoin_script="$temp_dir/${base_name}.rejoin.sh"
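# The quoted 'REJOIN_EOF' delimiter keeps this heredoc literal, so the embedded
# script's own variables ($PARTS, $BASE_NAME, ...) are written out unexpanded.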
cat > "$rejoin_script" << 'REJOIN_EOF'
#!/bin/bash
# Rejoin and Verify Script
set -e
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'
print_info() {
echo -e "${GREEN}[INFO]${NC} $1"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
# Find the base name from parts (nullglob so an empty match yields an empty array)
shopt -s nullglob
PARTS=(*.part_*)
shopt -u nullglob
if [ ${#PARTS[@]} -eq 0 ]; then
print_error "No part files found in current directory!"
exit 1
fi
# Extract base name (remove .part_xx suffix)
BASE_NAME="${PARTS[0]%.part_*}"
OUTPUT_FILE="$BASE_NAME"
print_info "Base name detected: $BASE_NAME"
print_info "Output file will be: $OUTPUT_FILE"
# Check if all required files exist
if [ ! -f "${BASE_NAME}.md5sums" ]; then
print_error "MD5 checksums file not found: ${BASE_NAME}.md5sums"
exit 1
fi
if [ ! -f "${BASE_NAME}.original.md5" ]; then
print_warning "Original MD5 file not found: ${BASE_NAME}.original.md5"
print_warning "Will skip original file verification"
SKIP_ORIGINAL=1
else
SKIP_ORIGINAL=0
fi
# Verify individual parts
print_info "Step 1: Verifying MD5 checksums of individual parts..."
ALL_PARTS_VALID=true
# Read "<md5> <part name>" pairs; read -r keeps the fields intact without awk
while read -r EXPECTED_MD5 PART_NAME; do
if [ ! -f "$PART_NAME" ]; then
print_error "Part file missing: $PART_NAME"
ALL_PARTS_VALID=false
continue
fi
ACTUAL_MD5=$(md5sum "$PART_NAME" | awk '{print $1}')
if [ "$EXPECTED_MD5" == "$ACTUAL_MD5" ]; then
print_info "✓ $PART_NAME: MD5 verified"
else
print_error "✗ $PART_NAME: MD5 mismatch!"
print_error " Expected: $EXPECTED_MD5"
print_error " Got: $ACTUAL_MD5"
ALL_PARTS_VALID=false
fi
done < "${BASE_NAME}.md5sums"
if [ "$ALL_PARTS_VALID" = false ]; then
print_error "Some parts failed MD5 verification. Aborting rejoin."
exit 1
fi
print_info "All parts verified successfully!"
# Rejoin the file
print_info "Step 2: Rejoining parts..."
cat "${BASE_NAME}".part_* > "$OUTPUT_FILE"
if [ ! -f "$OUTPUT_FILE" ]; then
print_error "Failed to create output file!"
exit 1
fi
OUTPUT_SIZE=$(stat -c%s "$OUTPUT_FILE" 2>/dev/null || stat -f%z "$OUTPUT_FILE")
print_info "Rejoined file size: $(numfmt --to=iec-i --suffix=B $OUTPUT_SIZE)"
# Verify rejoined file against original MD5
if [ $SKIP_ORIGINAL -eq 0 ]; then
print_info "Step 3: Verifying rejoined file against original MD5..."
ORIGINAL_MD5=$(cat "${BASE_NAME}.original.md5")
REJOINED_MD5=$(md5sum "$OUTPUT_FILE" | awk '{print $1}')
echo "Original MD5: $ORIGINAL_MD5"
echo "Rejoined MD5: $REJOINED_MD5"
if [ "$ORIGINAL_MD5" == "$REJOINED_MD5" ]; then
print_info "✓✓✓ SUCCESS! File integrity verified. MD5 checksums match!"
print_info "Rejoined file: $OUTPUT_FILE"
else
print_error "✗✗✗ FAILURE! MD5 checksum mismatch!"
print_error "The rejoined file may be corrupted!"
exit 1
fi
else
print_warning "Skipped original file verification (MD5 file not found)"
print_info "Rejoined file created: $OUTPUT_FILE"
fi
print_info "Done!"
REJOIN_EOF
chmod +x "$rejoin_script"
print_info "Created rejoin script: $rejoin_script"
# Upload all parts
print_info "Starting upload process for $base_name..."
local failed_uploads=()
for part in "${parts[@]}"; do
if ! upload_file "$part"; then
failed_uploads+=("$part")
fi
# Delay between uploads
if [ "$part" != "${parts[-1]}" ]; then
print_info "Waiting $DELAY_BETWEEN_UPLOADS seconds before next upload..."
sleep $DELAY_BETWEEN_UPLOADS
fi
done
# Upload metadata, MD5, and rejoin script
print_info "Uploading metadata files..."
upload_file "$metadata_file"
upload_file "$temp_dir/${base_name}.original.md5"
upload_file "$md5sums_file"
upload_file "$rejoin_script"
# Summary for this file
echo ""
print_section "Upload Summary for $base_name"
if [ ${#failed_uploads[@]} -eq 0 ]; then
print_info "✓ All parts uploaded successfully!"
print_info " Total parts: ${#parts[@]}"
print_info " MD5 checksums: Included"
print_info " Rejoin script: Uploaded"
UPLOAD_SUCCESS=true
else
print_error "✗ Some uploads failed:"
for failed in "${failed_uploads[@]}"; do
echo " - $(basename $failed)"
done
UPLOAD_SUCCESS=false
fi
# Store temp dir for cleanup
TEMP_DIRS+=("$temp_dir")
return $([ "$UPLOAD_SUCCESS" = true ] && echo 0 || echo 1)
}
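# Note: process_file's return status reflects the part uploads only; failures
# while uploading the metadata/MD5/rejoin files do not mark the file as failed.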
# Main script starts here
print_section "Split, Upload & Verify Tool"
# Check if files provided as arguments
if [ $# -eq 0 ]; then
print_info "No files specified as arguments."
print_info "Please enter file paths (one per line, empty line to finish):"
SOURCE_FILES=()
while true; do
read -p "File path: " file_path
if [ -z "$file_path" ]; then
break
fi
if [ -f "$file_path" ]; then
SOURCE_FILES+=("$file_path")
print_info "Added: $file_path"
else
print_warning "File not found: $file_path (skipped)"
fi
done
else
# Files provided as command line arguments
SOURCE_FILES=("$@")
fi
# Validate we have files to process
if [ ${#SOURCE_FILES[@]} -eq 0 ]; then
print_error "No valid files to process!"
echo ""
echo "Usage: $0 [file1] [file2] [file3] ..."
echo " or: $0 (then enter files interactively)"
exit 1
fi
print_info "Files to process: ${#SOURCE_FILES[@]}"
for f in "${SOURCE_FILES[@]}"; do
echo " - $f"
done
# Confirm processing
echo ""
read -p "Continue with these files? (y/n): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
print_info "Cancelled by user."
exit 0
fi
# Configuration confirmation
echo ""
print_info "Configuration:"
echo " - Number of parts per file: $NUM_PARTS"
echo " - Remote path: $REMOTE_PATH"
echo " - Delay between uploads: $DELAY_BETWEEN_UPLOADS seconds"
echo " - Upload retries: $UPLOAD_RETRIES"
echo ""
read -p "Modify configuration? (y/n): " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
read -p "Number of parts [$NUM_PARTS]: " input
NUM_PARTS=${input:-$NUM_PARTS}
read -p "Delay between uploads in seconds [$DELAY_BETWEEN_UPLOADS]: " input
DELAY_BETWEEN_UPLOADS=${input:-$DELAY_BETWEEN_UPLOADS}
read -p "Upload retries [$UPLOAD_RETRIES]: " input
UPLOAD_RETRIES=${input:-$UPLOAD_RETRIES}
print_info "Updated configuration applied."
fi
# Process all files
TEMP_DIRS=()
TOTAL_FILES=${#SOURCE_FILES[@]}
PROCESSED_FILES=0
FAILED_FILES=()
for i in "${!SOURCE_FILES[@]}"; do
file="${SOURCE_FILES[$i]}"
file_index=$((i + 1))
if process_file "$file" "$file_index" "$TOTAL_FILES"; then
PROCESSED_FILES=$((PROCESSED_FILES + 1))
else
FAILED_FILES+=("$file")
fi
# Delay between files
if [ $file_index -lt $TOTAL_FILES ]; then
echo ""
print_info "Waiting before processing next file..."
sleep $((DELAY_BETWEEN_UPLOADS / 2))
fi
done
# Final summary
echo ""
print_section "FINAL SUMMARY"
print_info "Total files processed: $PROCESSED_FILES / $TOTAL_FILES"
if [ ${#FAILED_FILES[@]} -gt 0 ]; then
print_error "Files with upload failures:"
for failed in "${FAILED_FILES[@]}"; do
echo " - $failed"
done
else
print_info "✓✓✓ All files processed and uploaded successfully!"
fi
# Instructions for rejoining
echo ""
print_section "How to Rejoin Files"
echo "On the remote system, for each file:"
echo " 1. Download all parts and associated files:"
echo " rclone copy $REMOTE_PATH . --include 'filename.*'"
echo ""
echo " 2. Run the rejoin script:"
echo " bash filename.rejoin.sh"
echo ""
echo " The script will:"
echo " - Verify MD5 of each part"
echo " - Rejoin the parts"
echo " - Verify the final file against original MD5"
# Cleanup
echo ""
read -p "Delete temporary directories? (y/n): " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
for temp_dir in "${TEMP_DIRS[@]}"; do
if [ -d "$temp_dir" ]; then
rm -rf "$temp_dir"
print_info "Deleted: $temp_dir"
fi
done
else
print_info "Temporary files kept in:"
for temp_dir in "${TEMP_DIRS[@]}"; do
echo " - $temp_dir"
done
fi
print_info "All done!"