Created
January 23, 2025 15:58
-
-
Save odinho/e425d9d260880339c342bb7fe139b34e to your computer and use it in GitHub Desktop.
Cleans up backup files produced by SMS Backup & Restore: resizes images, removes videos, and removes duplicate messages.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import xml.etree.ElementTree as ET | |
| import base64 | |
| from PIL import Image | |
| import io | |
| import sys | |
| from pathlib import Path | |
| import hashlib | |
def resize_base64_image(base64_string, max_size=(1024, 1024)):
    """
    Shrink a base64 encoded image so it fits within max_size, keeping aspect ratio.

    The image is re-encoded as JPEG (quality 85). The re-encoded image is only
    returned when it is actually smaller than the input; otherwise, or when the
    data cannot be decoded or processed, the original string is returned unchanged.

    Args:
        base64_string: Base64 text of the encoded image file.
        max_size: (max_width, max_height) bounding box in pixels.

    Returns:
        Base64 text of the smaller JPEG, or base64_string itself.
    """
    # Modes the Pillow JPEG writer accepts as-is; everything else is converted to RGB.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
    try:
        # Local import: only this function needs ImageOps.
        from PIL import ImageOps

        image_data = base64.b64decode(base64_string)
        with io.BytesIO(image_data) as img_io:
            img = Image.open(img_io)
            # The JPEG is saved without EXIF, so bake the orientation tag into
            # the pixels first; otherwise rotated photos come out sideways.
            img = ImageOps.exif_transpose(img)
            if img.mode not in jpeg_modes:
                img = img.convert('RGB')
            # Scale factor that fits the image inside the bounding box.
            ratio = min(max_size[0] / img.width, max_size[1] / img.height)
            if ratio < 1:  # Only resize if image is larger than max_size
                # Clamp to 1px so extreme aspect ratios never yield a zero dimension.
                new_size = (
                    max(1, int(img.width * ratio)),
                    max(1, int(img.height * ratio)),
                )
                img = img.resize(new_size, Image.Resampling.LANCZOS)
            with io.BytesIO() as output_buffer:
                img.save(output_buffer, format='JPEG', quality=85, optimize=True)
                resized_bytes = output_buffer.getvalue()
    except Exception as e:
        # Deliberately broad: this is best-effort, and any decoding or imaging
        # failure must leave the attachment untouched rather than abort the run.
        print(f"Error processing image: {str(e)}")
        return base64_string
    # Re-encoding a small or already well-compressed image can make it bigger.
    if len(resized_bytes) >= len(image_data):
        return base64_string
    return base64.b64encode(resized_bytes).decode('utf-8')
def get_message_key(msg_elem):
    """
    Build the deduplication key for a message element.

    The key is the MD5 hex digest of "date|address|text". For an 'sms'
    element the text is its 'body' attribute; for any other element it is
    the 'text' attribute of the first 'text/plain' part under 'parts'
    (empty when there is none).
    """
    if msg_elem.tag == 'sms':
        body = msg_elem.get('body', '')
    else:
        body = ''
        container = msg_elem.find('parts')
        if container is not None:
            # First text/plain part wins; later ones are ignored.
            body = next(
                (
                    piece.get('text', '')
                    for piece in container.findall('part')
                    if piece.get('ct') == 'text/plain'
                ),
                '',
            )
    fingerprint = "{}|{}|{}".format(
        msg_elem.get('date', ''),
        msg_elem.get('address', ''),
        body,
    )
    digest = hashlib.md5(fingerprint.encode())
    return digest.hexdigest()
def _select_unique_messages(messages):
    """Return (unique messages in output order, number of duplicates dropped)."""
    # Dicts keep insertion order, so deleting and re-inserting a key moves the
    # message to the end, exactly where a replacing MMS should go.
    kept = {}
    dropped = 0
    for msg in messages:
        key = get_message_key(msg)
        earlier = kept.get(key)
        if earlier is None:
            kept[key] = msg
            continue
        # Either way exactly one of the two copies is discarded.
        dropped += 1
        if msg.tag == 'mms' and earlier.tag == 'sms':
            # Prefer the MMS version over a previously seen SMS version.
            del kept[key]
            kept[key] = msg
    return list(kept.values()), dropped


def process_sms_backup(input_file, output_file):
    """
    Process the SMS backup XML file and save the cleaned result to a new file.

    Steps: deduplicate 'sms' and 'mms' elements (first occurrence wins, but an
    MMS replaces an SMS with the same key), drop 'video/*' parts from MMS
    messages, shrink 'image/*' parts via resize_base64_image, then write a new
    'smses' document and print statistics.

    Args:
        input_file: Path of the backup XML file to read.
        output_file: Path the processed XML is written to.

    Raises:
        xml.etree.ElementTree.ParseError: If input_file is not well-formed XML.
        OSError: If either file cannot be read or written.
    """
    # Parse the XML file
    tree = ET.parse(input_file)
    root = tree.getroot()
    original_size = Path(input_file).stat().st_size

    # Identify unique messages across both SMS and MMS (all SMS first, then MMS)
    unique_messages, removed_duplicates = _select_unique_messages(
        root.findall('.//sms') + root.findall('.//mms')
    )

    # Create new root with same attributes
    new_root = ET.Element('smses')
    for key, value in root.attrib.items():
        new_root.set(key, value)
    # A copied message count would be stale after deduplication.
    if 'count' in new_root.attrib:
        new_root.set('count', str(len(unique_messages)))

    # Track statistics
    total_images = 0
    processed_images = 0
    removed_videos = 0

    for msg in unique_messages:
        parts = msg.find('parts') if msg.tag == 'mms' else None
        if parts is not None:
            # findall() returns a list, so removing parts while looping is safe.
            for part in parts.findall('part'):
                content_type = part.get('ct', '')
                if content_type.startswith('video/'):
                    parts.remove(part)
                    removed_videos += 1
                    continue
                if not content_type.startswith('image/'):
                    continue
                total_images += 1
                image_data = part.get('data')
                if not image_data:
                    continue
                print(f"Processing image {total_images}...")
                resized_data = resize_base64_image(image_data)
                if resized_data != image_data:
                    part.set('data', resized_data)
                    # Changed data is always JPEG; keep the declared type in sync.
                    part.set('ct', 'image/jpeg')
                processed_images += 1
        new_root.append(msg)

    # Create new tree and write to file
    new_tree = ET.ElementTree(new_root)
    new_tree.write(output_file, encoding='utf-8', xml_declaration=True)

    # Calculate final size
    final_size = Path(output_file).stat().st_size

    # Print statistics
    print("\nProcessing complete!")
    print(f"Total images found: {total_images}")
    print(f"Images processed: {processed_images}")
    print(f"Videos removed: {removed_videos}")
    print(f"Duplicate messages removed: {removed_duplicates}")
    print(f"Original file size: {original_size / 1024 / 1024:.2f} MB")
    print(f"Final file size: {final_size / 1024 / 1024:.2f} MB")
    print(f"Size reduction: {(1 - final_size/original_size) * 100:.1f}%")
def main():
    """
    Command-line entry point.

    Expects exactly two arguments (input path, output path); otherwise prints
    the usage line and exits with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python script.py input.xml output.xml")
        sys.exit(1)
    input_file, output_file = cli_args
    print(f"Processing {input_file}...")
    process_sms_backup(input_file, output_file)


if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
SMS Backup File Processor
This script processes Android SMS backup XML files (as produced by "SMS Backup & Restore" app) to reduce file size and clean up the backup. It:
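Deduplicates messages by hashing each message's date, address, and text, keeping the first occurrence and preferring the MMS copy when the same message exists as both SMS and MMS.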
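Processes images in MMS messages: images larger than 1024×1024 are downscaled (preserving aspect ratio) and re-encoded as JPEG at quality 85.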
Removes video attachments (which are typically backed up elsewhere)
The script preserves all message content and metadata while significantly reducing file size (typically 80-85% reduction). Running gzip on the output file provides additional compression.
Usage