Created
November 11, 2025 05:11
-
-
Save r4d10n/26dc351295a1a3a1a49918eb234177ba to your computer and use it in GitHub Desktop.
Script to split/rejoin a large file into/from multiparts with MD5 checksum checks and sync upload to a Rclone compatible drive (S3, Google Drive, etc.) to beat Request Rate Limiting
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Script to split/rejoin a large file into/from multiparts with MD5 checksum checks.
# Sync upload to an Rclone compatible drive (Google Drive, S3, etc.) to beat
# Request Rate Limiting.

# Configuration (the part count, delay and retry count can also be changed
# interactively at run time)
REMOTE_PATH="drv:path"     # rclone remote:path; use `rclone config` to set up the remote
NUM_PARTS=20               # number of parts each file is split into
DELAY_BETWEEN_UPLOADS=15   # seconds
UPLOAD_RETRIES=3           # attempts per file before an upload counts as failed

# Colors for output. The escape sequences are stored literally and are only
# interpreted when a message is printed.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Functions to print colored messages

# Print an informational message to stdout, prefixed with a green [INFO] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim
print_info() {
    # %b interprets the escapes in the color codes only; the message goes
    # through %s so backslashes in file names are not treated as escapes.
    printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}
# Print an error message to stderr, prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
print_error() {
    # Diagnostics go to stderr so they never mix into captured stdout; the
    # message goes through %s so backslashes in it stay literal.
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}
# Print a warning message to stderr, prefixed with a yellow [WARNING] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim
print_warning() {
    # Diagnostics go to stderr; the message goes through %s so backslashes
    # in it stay literal.
    printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1" >&2
}
# Print a blue section banner of the form "[====== title ======]" to stdout.
# Globals:   BLUE, NC (read)
# Arguments: $1 - section title, printed verbatim
print_section() {
    # The title goes through %s so backslashes in it (e.g. from a file
    # path) are not interpreted as escape sequences.
    printf '%b[====== %s ======]%b\n' "$BLUE" "$1" "$NC"
}
| # Function to calculate MD5 | |
| calculate_md5() { | |
| local file=$1 | |
| md5sum "$file" | awk '{print $1}' | |
| } | |
# Function to create MD5 for parts
#
# Write "<md5>  <part name>" for every "<base>.part_*" file in a directory
# to "<dir>/<base>.md5sums" (the two-space format `md5sum -c` accepts).
# Arguments: $1 - directory holding the parts
#            $2 - base name of the split file
# Outputs:   the path of the checksum file on stdout (and nothing else);
#            progress messages on stderr
# Returns:   0 on success, 1 if the checksum file cannot be created or a
#            part cannot be hashed
create_parts_md5() {
    local temp_dir=$1
    local base_name=$2
    local md5_file="$temp_dir/${base_name}.md5sums"
    local part part_name part_md5

    # Callers capture stdout to get the checksum file path, so progress
    # messages are redirected to stderr to keep them out of that value.
    print_info "Creating MD5 checksums for all parts..." >&2

    : > "$md5_file" || return 1 # Clear/create file

    for part in "$temp_dir/${base_name}.part_"*; do
        # An unmatched glob stays literal; skip anything that is not a file.
        [ -f "$part" ] || continue
        part_name=$(basename -- "$part")
        part_md5=$(calculate_md5 "$part") || return 1
        printf '%s  %s\n' "$part_md5" "$part_name" >> "$md5_file"
        print_info " $part_name: $part_md5" >&2
    done

    printf '%s\n' "$md5_file"
}
# Upload function with retry logic
#
# Copy one file to the remote with rclone, retrying on failure.
# Globals:   REMOTE_PATH, UPLOAD_RETRIES, DELAY_BETWEEN_UPLOADS (read)
# Arguments: $1 - path of the file to upload
# Returns:   0 once an attempt succeeds, 1 after UPLOAD_RETRIES failed attempts
upload_file() {
    local file=$1
    local attempt=1
    local name

    # Resolve the display name once, quoted, so paths containing spaces
    # are reported correctly.
    name=$(basename -- "$file")

    while [ "$attempt" -le "$UPLOAD_RETRIES" ]; do
        print_info "Uploading $name (attempt $attempt/$UPLOAD_RETRIES)..."
        if rclone copy "$file" "$REMOTE_PATH" \
            --transfers 1 \
            --checkers 1 \
            --retries 3 \
            --low-level-retries 5 \
            --timeout 2h \
            --retries-sleep 10s \
            -P; then
            print_info "✓ Successfully uploaded $name"
            return 0
        fi

        print_warning "✗ Upload failed for $name, attempt $attempt/$UPLOAD_RETRIES"
        attempt=$((attempt + 1))
        # Back off for twice the normal inter-upload delay, but only when
        # another attempt is actually going to be made.
        if [ "$attempt" -le "$UPLOAD_RETRIES" ]; then
            sleep "$((DELAY_BETWEEN_UPLOADS * 2))"
        fi
    done

    print_error "Failed to upload $name after $UPLOAD_RETRIES attempts"
    return 1
}
# Process a single file
#
# Split one file into NUM_PARTS parts inside a fresh temporary directory,
# record MD5 checksums and metadata, generate a standalone rejoin script,
# then upload the parts followed by the auxiliary files.
# Globals:   NUM_PARTS, DELAY_BETWEEN_UPLOADS (read)
#            TEMP_DIRS (the temporary directory is appended)
#            UPLOAD_SUCCESS (set to true/false once uploads were attempted)
# Arguments: $1 - path of the file to process (relative or absolute)
#            $2 - 1-based index of this file, for display
#            $3 - total number of files, for display
# Returns:   0 if every part and auxiliary file was uploaded, 1 otherwise
process_file() {
    local source_file=$1
    local file_index=$2
    local total_files=$3

    print_section "Processing file $file_index/$total_files: $source_file"

    # Check if source file exists
    if [ ! -f "$source_file" ]; then
        print_error "Source file '$source_file' not found!"
        return 1
    fi

    # Get file info (GNU stat first, BSD stat as fallback)
    local file_size base_name
    file_size=$(stat -c%s "$source_file" 2>/dev/null || stat -f%z "$source_file")
    base_name=$(basename -- "$source_file")
    print_info "File size: $(numfmt --to=iec-i --suffix=B "$file_size")"

    # Create temporary directory for this file
    local temp_dir="./split_parts_${base_name}_$$"
    if ! mkdir -p "$temp_dir"; then
        print_error "Could not create temporary directory: $temp_dir"
        return 1
    fi
    # Register the directory right away so it can be cleaned up even if a
    # later step fails.
    TEMP_DIRS+=("$temp_dir")
    print_info "Created temporary directory: $temp_dir"

    # Calculate MD5 checksum of original file
    print_info "Calculating MD5 checksum of original file..."
    local original_md5
    original_md5=$(calculate_md5 "$source_file")
    echo "$original_md5" > "$temp_dir/${base_name}.original.md5"
    print_info "Original MD5: $original_md5"

    # Split the file. The output prefix carries the directory, so there is
    # no need to cd and the source path works whether relative or absolute.
    print_info "Splitting file into $NUM_PARTS parts..."
    if ! split -n "$NUM_PARTS" -- "$source_file" "$temp_dir/${base_name}.part_"; then
        print_error "Failed to split '$source_file'"
        return 1
    fi

    # List parts
    local -a parts=("$temp_dir/${base_name}.part_"*)
    local part part_size
    print_info "Created ${#parts[@]} parts:"
    for part in "${parts[@]}"; do
        part_size=$(stat -c%s "$part" 2>/dev/null || stat -f%z "$part")
        echo " - $(basename -- "$part"): $(numfmt --to=iec-i --suffix=B "$part_size")"
    done

    # Create MD5 checksums for all parts. The checksum file path is the last
    # line the helper prints; taking only that line keeps any progress
    # output on stdout from leaking into the path.
    local md5sums_file
    md5sums_file=$(create_parts_md5 "$temp_dir" "$base_name" | tail -n 1)

    # Create metadata file
    local metadata_file="$temp_dir/${base_name}.metadata.txt"
    {
        cat << EOF
Original File: $source_file
Original Size: $file_size bytes
Original MD5: $original_md5
Number of Parts: $NUM_PARTS
Split Date: $(date)
Parts:
EOF
        for part in "${parts[@]}"; do
            echo " $(basename -- "$part")"
        done
    } > "$metadata_file"
    print_info "Created metadata file: $metadata_file"

    # Create rejoin and verify script (quoted delimiter: written verbatim)
    local rejoin_script="$temp_dir/${base_name}.rejoin.sh"
    cat > "$rejoin_script" << 'REJOIN_EOF'
#!/bin/bash
# Rejoin and Verify Script
set -e
# Unmatched globs expand to nothing, so the "no parts" checks below work
shopt -s nullglob

GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

print_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}
print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

# Prefer the base name encoded in this script's own file name
# (<base>.rejoin.sh) so the right parts are used even when parts of several
# files share the directory; otherwise find the base name from the parts.
SCRIPT_NAME=$(basename -- "$0")
BASE_NAME="${SCRIPT_NAME%.rejoin.sh}"
PARTS=("$BASE_NAME".part_*)
if [ ${#PARTS[@]} -eq 0 ]; then
    PARTS=(*.part_*)
    if [ ${#PARTS[@]} -eq 0 ]; then
        print_error "No part files found in current directory!"
        exit 1
    fi
    # Extract base name (remove .part_xx suffix)
    BASE_NAME="${PARTS[0]%.part_*}"
fi
OUTPUT_FILE="$BASE_NAME"
print_info "Base name detected: $BASE_NAME"
print_info "Output file will be: $OUTPUT_FILE"

# Check if all required files exist
if [ ! -f "${BASE_NAME}.md5sums" ]; then
    print_error "MD5 checksums file not found: ${BASE_NAME}.md5sums"
    exit 1
fi
if [ ! -f "${BASE_NAME}.original.md5" ]; then
    print_warning "Original MD5 file not found: ${BASE_NAME}.original.md5"
    print_warning "Will skip original file verification"
    SKIP_ORIGINAL=1
else
    SKIP_ORIGINAL=0
fi

# Verify individual parts
print_info "Step 1: Verifying MD5 checksums of individual parts..."
ALL_PARTS_VALID=true
VERIFIED_PARTS=()
# Each line is "<md5> <part name>"; the name takes the rest of the line,
# so part names containing spaces are handled.
while read -r EXPECTED_MD5 PART_NAME; do
    [ -n "$PART_NAME" ] || continue
    if [ ! -f "$PART_NAME" ]; then
        print_error "Part file missing: $PART_NAME"
        ALL_PARTS_VALID=false
        continue
    fi
    ACTUAL_MD5=$(md5sum < "$PART_NAME")
    ACTUAL_MD5=${ACTUAL_MD5%% *}
    if [ "$EXPECTED_MD5" == "$ACTUAL_MD5" ]; then
        print_info "✓ $PART_NAME: MD5 verified"
        VERIFIED_PARTS+=("$PART_NAME")
    else
        print_error "✗ $PART_NAME: MD5 mismatch!"
        print_error " Expected: $EXPECTED_MD5"
        print_error " Got: $ACTUAL_MD5"
        ALL_PARTS_VALID=false
    fi
done < "${BASE_NAME}.md5sums"
if [ "$ALL_PARTS_VALID" = false ]; then
    print_error "Some parts failed MD5 verification. Aborting rejoin."
    exit 1
fi
if [ ${#VERIFIED_PARTS[@]} -eq 0 ]; then
    print_error "No parts listed in ${BASE_NAME}.md5sums. Aborting rejoin."
    exit 1
fi
print_info "All parts verified successfully!"

# Rejoin the file from exactly the verified parts, in the recorded order
print_info "Step 2: Rejoining parts..."
cat -- "${VERIFIED_PARTS[@]}" > "$OUTPUT_FILE"
if [ ! -f "$OUTPUT_FILE" ]; then
    print_error "Failed to create output file!"
    exit 1
fi
OUTPUT_SIZE=$(stat -c%s "$OUTPUT_FILE" 2>/dev/null || stat -f%z "$OUTPUT_FILE")
print_info "Rejoined file size: $(numfmt --to=iec-i --suffix=B "$OUTPUT_SIZE")"

# Verify rejoined file against original MD5
if [ "$SKIP_ORIGINAL" -eq 0 ]; then
    print_info "Step 3: Verifying rejoined file against original MD5..."
    ORIGINAL_MD5=$(cat "${BASE_NAME}.original.md5")
    REJOINED_MD5=$(md5sum < "$OUTPUT_FILE")
    REJOINED_MD5=${REJOINED_MD5%% *}
    echo "Original MD5: $ORIGINAL_MD5"
    echo "Rejoined MD5: $REJOINED_MD5"
    if [ "$ORIGINAL_MD5" == "$REJOINED_MD5" ]; then
        print_info "✓✓✓ SUCCESS! File integrity verified. MD5 checksums match!"
        print_info "Rejoined file: $OUTPUT_FILE"
    else
        print_error "✗✗✗ FAILURE! MD5 checksum mismatch!"
        print_error "The rejoined file may be corrupted!"
        exit 1
    fi
else
    print_warning "Skipped original file verification (MD5 file not found)"
    print_info "Rejoined file created: $OUTPUT_FILE"
fi
print_info "Done!"
REJOIN_EOF
    chmod +x "$rejoin_script"
    print_info "Created rejoin script: $rejoin_script"

    # Upload all parts
    print_info "Starting upload process for $base_name..."
    local -a failed_uploads=()
    local last_index=$(( ${#parts[@]} - 1 ))
    local i
    for i in "${!parts[@]}"; do
        if ! upload_file "${parts[$i]}"; then
            failed_uploads+=("${parts[$i]}")
        fi
        # Delay between uploads (not after the last part)
        if [ "$i" -lt "$last_index" ]; then
            print_info "Waiting $DELAY_BETWEEN_UPLOADS seconds before next upload..."
            sleep "$DELAY_BETWEEN_UPLOADS"
        fi
    done

    # Upload metadata, MD5, and rejoin script. These count towards the
    # result too: without them the parts cannot be verified or rejoined.
    print_info "Uploading metadata files..."
    local aux_file
    for aux_file in "$metadata_file" "$temp_dir/${base_name}.original.md5" \
        "$md5sums_file" "$rejoin_script"; do
        if ! upload_file "$aux_file"; then
            failed_uploads+=("$aux_file")
        fi
    done

    # Summary for this file
    echo ""
    print_section "Upload Summary for $base_name"
    if [ ${#failed_uploads[@]} -eq 0 ]; then
        print_info "✓ All parts uploaded successfully!"
        print_info " Total parts: ${#parts[@]}"
        print_info " MD5 checksums: Included"
        print_info " Rejoin script: Uploaded"
        UPLOAD_SUCCESS=true
        return 0
    fi

    print_error "✗ Some uploads failed:"
    local failed
    for failed in "${failed_uploads[@]}"; do
        echo " - $(basename -- "$failed")"
    done
    UPLOAD_SUCCESS=false
    return 1
}
# Main script starts here
print_section "Split, Upload & Verify Tool"

# Build SOURCE_FILES: from the command-line arguments when any were given,
# otherwise by prompting for paths until an empty line is entered.
if [ $# -eq 0 ]; then
    print_info "No files specified as arguments."
    print_info "Please enter file paths (one per line, empty line to finish):"
    SOURCE_FILES=()
    while true; do
        file_path=""
        # -r keeps backslashes in the typed path literal instead of
        # treating them as escape characters.
        read -r -p "File path: " file_path
        if [ -z "$file_path" ]; then
            break
        fi
        if [ -f "$file_path" ]; then
            SOURCE_FILES+=("$file_path")
            print_info "Added: $file_path"
        else
            print_warning "File not found: $file_path (skipped)"
        fi
    done
else
    # Files provided as command line arguments
    SOURCE_FILES=("$@")
fi

# Validate we have files to process
if [ ${#SOURCE_FILES[@]} -eq 0 ]; then
    print_error "No valid files to process!"
    echo ""
    echo "Usage: $0 [file1] [file2] [file3] ..."
    echo " or: $0 (then enter files interactively)"
    exit 1
fi

print_info "Files to process: ${#SOURCE_FILES[@]}"
for f in "${SOURCE_FILES[@]}"; do
    echo " - $f"
done
# Confirm processing
echo ""
read -p "Continue with these files? (y/n): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    print_info "Cancelled by user."
    exit 0
fi

# Prompt for an integer setting and store it in the named variable.
# Empty input keeps the current value; anything that is not a decimal
# integer >= the minimum is rejected with a warning, because these values
# are later used in numeric tests and arithmetic.
# Arguments: $1 - name of the variable to update
#            $2 - prompt label
#            $3 - smallest accepted value
prompt_integer_setting() {
    local var_name=$1
    local label=$2
    local minimum=$3
    local current=${!var_name}
    local input=""

    read -r -p "$label [$current]: " input
    [ -n "$input" ] || return 0

    # 10# forces base 10 so input such as "08" is not parsed as octal.
    if [[ $input =~ ^[0-9]+$ ]] && [ "$((10#$input))" -ge "$minimum" ]; then
        printf -v "$var_name" '%d' "$((10#$input))"
    else
        print_warning "Ignoring invalid value '$input' for '$label'; keeping $current"
    fi
}

# Configuration confirmation
echo ""
print_info "Configuration:"
echo " - Number of parts per file: $NUM_PARTS"
echo " - Remote path: $REMOTE_PATH"
echo " - Delay between uploads: $DELAY_BETWEEN_UPLOADS seconds"
echo " - Upload retries: $UPLOAD_RETRIES"
echo ""
read -p "Modify configuration? (y/n): " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
    prompt_integer_setting NUM_PARTS "Number of parts" 1
    prompt_integer_setting DELAY_BETWEEN_UPLOADS "Delay between uploads in seconds" 0
    prompt_integer_setting UPLOAD_RETRIES "Upload retries" 1
    print_info "Updated configuration applied."
fi
# Run process_file over every entry of SOURCE_FILES, counting successes in
# PROCESSED_FILES and collecting the failures in FAILED_FILES. process_file
# records its temporary directories in TEMP_DIRS.
TEMP_DIRS=()
FAILED_FILES=()
PROCESSED_FILES=0
TOTAL_FILES=${#SOURCE_FILES[@]}

file_index=0
for file in "${SOURCE_FILES[@]}"; do
    file_index=$((file_index + 1))

    if ! process_file "$file" "$file_index" "$TOTAL_FILES"; then
        FAILED_FILES+=("$file")
    else
        PROCESSED_FILES=$((PROCESSED_FILES + 1))
    fi

    # Pause for half the upload delay before the next file; nothing to
    # wait for after the last one.
    [ "$file_index" -lt "$TOTAL_FILES" ] || continue
    echo ""
    print_info "Waiting before processing next file..."
    sleep $((DELAY_BETWEEN_UPLOADS / 2))
done
# Report the overall result: how many files went through, and which ones
# had upload failures.
echo ""
print_section "FINAL SUMMARY"
print_info "Total files processed: $PROCESSED_FILES / $TOTAL_FILES"
if [ ${#FAILED_FILES[@]} -eq 0 ]; then
    print_info "✓✓✓ All files processed and uploaded successfully!"
else
    print_error "Files with upload failures:"
    for failed in "${FAILED_FILES[@]}"; do
        echo " - $failed"
    done
fi

# Tell the user how to download and rejoin the parts on the other side.
echo ""
print_section "How to Rejoin Files"
cat << EOF
On the remote system, for each file:
 1. Download all parts and associated files:
 rclone copy $REMOTE_PATH . --include 'filename.*'

 2. Run the rejoin script:
 bash filename.rejoin.sh

 The script will:
 - Verify MD5 of each part
 - Rejoin the parts
 - Verify the final file against original MD5
EOF
# Offer to remove the temporary split directories recorded in TEMP_DIRS;
# if the user declines, list where they were left.
echo ""
read -p "Delete temporary directories? (y/n): " -n 1 -r
echo
case $REPLY in
    [Yy])
        for temp_dir in "${TEMP_DIRS[@]}"; do
            # Skip entries that no longer exist as directories.
            [ -d "$temp_dir" ] || continue
            rm -rf "$temp_dir"
            print_info "Deleted: $temp_dir"
        done
        ;;
    *)
        print_info "Temporary files kept in:"
        for temp_dir in "${TEMP_DIRS[@]}"; do
            echo " - $temp_dir"
        done
        ;;
esac
print_info "All done!"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment