Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save odinho/e425d9d260880339c342bb7fe139b34e to your computer and use it in GitHub Desktop.

Select an option

Save odinho/e425d9d260880339c342bb7fe139b34e to your computer and use it in GitHub Desktop.
Fixes up the backup files from SMS backup and restore to be better (resize images, remove videos, remove dupes)
import base64
import hashlib
import io
import sys
import xml.etree.ElementTree as ET
from pathlib import Path

from PIL import Image, ImageOps
def resize_base64_image(base64_string, max_size=(1024, 1024)):
    """
    Shrink a base64 encoded image and re-encode it as a JPEG.

    The image is rotated according to its EXIF orientation tag, converted to
    a JPEG-compatible mode, scaled down to fit inside ``max_size`` while
    keeping its aspect ratio (it is never enlarged), and saved as an
    optimized JPEG with quality 85.

    Args:
        base64_string: Base64 text of the encoded image file.
        max_size: ``(max_width, max_height)`` bounding box in pixels.

    Returns:
        Base64 text of the re-encoded JPEG, or ``base64_string`` unchanged
        when the data cannot be decoded or re-encoded (the error is printed).
    """
    try:
        # Decode base64 string to bytes
        image_data = base64.b64decode(base64_string)
        # Keep the buffer open for the whole pipeline: PIL reads lazily
        with io.BytesIO(image_data) as img_io:
            img = Image.open(img_io)
            # The JPEG is saved without EXIF, so bake the orientation into
            # the pixels first; otherwise rotated photos lose their rotation
            img = ImageOps.exif_transpose(img)
            # Convert every mode except plain RGB / greyscale (RGBA, P, LA,
            # I;16, CMYK, ...) so the JPEG save below cannot reject the mode
            if img.mode not in ('RGB', 'L'):
                img = img.convert('RGB')
            # Calculate new dimensions maintaining aspect ratio
            ratio = min(max_size[0] / img.width, max_size[1] / img.height)
            if ratio < 1:  # Only resize if image is larger than max_size
                # Clamp to 1px: very elongated images would round down to 0
                new_size = (
                    max(1, int(img.width * ratio)),
                    max(1, int(img.height * ratio)),
                )
                img = img.resize(new_size, Image.Resampling.LANCZOS)
            # Save resized image to bytes
            output_buffer = io.BytesIO()
            img.save(output_buffer, format='JPEG', quality=85, optimize=True)
            # Convert back to base64
            return base64.b64encode(output_buffer.getvalue()).decode('utf-8')
    except Exception as e:
        # Deliberately broad: one broken attachment must not abort the whole
        # backup, so fall back to the untouched data
        print(f"Error processing image: {e}")
        return base64_string
def get_message_key(msg_elem):
    """
    Build the deduplication key of an ``sms`` or ``mms`` element.

    The key is the MD5 hex digest of ``date|address|text``. For an ``sms``
    element the text is its ``body`` attribute; for anything else it is the
    ``text`` attribute of the first ``text/plain`` part under ``parts``
    (empty when there is none), so an SMS and an MMS carrying the same text
    at the same date and address produce the same key.
    """
    if msg_elem.tag == 'sms':
        body = msg_elem.get('body', '')
    else:  # mms
        body = ''
        container = msg_elem.find('parts')
        if container is not None:
            # Only the first plain-text part contributes to the key
            body = next(
                (
                    piece.get('text', '')
                    for piece in container.findall('part')
                    if piece.get('ct') == 'text/plain'
                ),
                '',
            )

    stamp = msg_elem.get('date', '')
    sender = msg_elem.get('address', '')
    digest = hashlib.md5(f"{stamp}|{sender}|{body}".encode())
    return digest.hexdigest()
def _dedupe_messages(root):
    """Return ``(unique messages, number of dropped messages)`` for ``root``."""
    # key -> kept element; dicts keep insertion order, which gives the output
    # order and O(1) replacement without re-hashing every kept message
    unique = {}
    dropped = 0
    for msg in root.findall('.//sms') + root.findall('.//mms'):
        key = get_message_key(msg)
        kept = unique.get(key)
        # Compare with None explicitly: a childless Element is falsy
        if kept is None:
            unique[key] = msg
            continue
        # Either the new message or the previously kept one is discarded
        dropped += 1
        if msg.tag == 'mms' and kept.tag == 'sms':
            # Prefer the MMS version; delete first so it is re-inserted at
            # the end instead of taking over the SMS's position
            del unique[key]
            unique[key] = msg
    return list(unique.values()), dropped


def _slim_mms_parts(msg, stats):
    """Drop video parts and re-encode image parts of one ``mms`` in place."""
    parts = msg.find('parts')
    if parts is None:
        return
    # findall() returns a list, so removing parts while looping is safe
    for part in parts.findall('part'):
        content_type = part.get('ct', '')
        if content_type.startswith('video/'):
            parts.remove(part)
            stats['removed_videos'] += 1
        elif content_type.startswith('image/'):
            stats['total_images'] += 1
            image_data = part.get('data')
            if not image_data:
                continue
            print(f"Processing image {stats['total_images']}...")
            resized_data = resize_base64_image(image_data)
            # resize_base64_image hands back its input untouched on failure;
            # only a changed payload is a JPEG and counts as processed
            if resized_data != image_data:
                part.set('data', resized_data)
                # The payload is now JPEG whatever it was before
                part.set('ct', 'image/jpeg')
                stats['processed_images'] += 1


def process_sms_backup(input_file, output_file):
    """
    Deduplicate an SMS backup XML file, shrink its attachments and save it.

    All ``sms`` elements, followed by all ``mms`` elements, are deduplicated
    by ``get_message_key``; when an SMS and an MMS share a key the MMS is
    kept. Video parts of the remaining MMS messages are removed and image
    parts are re-encoded through ``resize_base64_image``. The result is
    written under a new ``smses`` root that carries the original root's
    attributes, and a summary is printed.

    Args:
        input_file: Path of the backup XML file to read.
        output_file: Path the processed XML is written to.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    # Parse the XML file
    tree = ET.parse(input_file)
    root = tree.getroot()
    original_size = Path(input_file).stat().st_size

    unique_messages, removed_duplicates = _dedupe_messages(root)

    # Create new root with same attributes
    new_root = ET.Element('smses', dict(root.attrib))
    if 'count' in root.attrib:
        # Presumably the number of messages in the backup (TODO confirm
        # against the app's format description); refresh it so it does not
        # go stale after deduplication
        new_root.set('count', str(len(unique_messages)))

    stats = {'total_images': 0, 'processed_images': 0, 'removed_videos': 0}
    for msg in unique_messages:
        if msg.tag == 'mms':
            _slim_mms_parts(msg, stats)
        new_root.append(msg)

    # Create new tree and write to file
    new_tree = ET.ElementTree(new_root)
    new_tree.write(output_file, encoding='utf-8', xml_declaration=True)
    final_size = Path(output_file).stat().st_size

    # Print statistics
    print("\nProcessing complete!")
    print(f"Total images found: {stats['total_images']}")
    print(f"Images processed: {stats['processed_images']}")
    print(f"Videos removed: {stats['removed_videos']}")
    print(f"Duplicate messages removed: {removed_duplicates}")
    print(f"Original file size: {original_size / 1024 / 1024:.2f} MB")
    print(f"Final file size: {final_size / 1024 / 1024:.2f} MB")
    print(f"Size reduction: {(1 - final_size/original_size) * 100:.1f}%")
def main():
    """
    Command-line entry point.

    Expects exactly two arguments, the input and the output XML path;
    otherwise prints the usage line and exits with status 1.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 2:
        print("Usage: python script.py input.xml output.xml")
        sys.exit(1)

    input_file, output_file = arguments
    print(f"Processing {input_file}...")
    process_sms_backup(input_file, output_file)


# Run only when executed as a script, not when imported
if __name__ == "__main__":
    main()
@odinho
Copy link
Author

odinho commented Jan 23, 2025

SMS Backup File Processor

This script processes Android SMS backup XML files (as produced by "SMS Backup & Restore" app) to reduce file size and clean up the backup. It:

  1. Deduplicates messages by:

    • Removing exact duplicates
    • Removing SMS versions when an MMS version exists
    • Comparing date, address (the other party's number), and text content to identify duplicates
  2. Processes images in MMS messages:

    • Resizes large images to max 1024x1024px (preserving aspect ratio)
    • Converts to JPEG format with quality=85
    • Optimizes compression
    • Only resizes images larger than the target size (smaller images are still re-encoded as JPEG)
  3. Removes video attachments (which are typically backed up elsewhere)

The script preserves all message text and metadata (images are recompressed lossily and videos are dropped) while significantly reducing file size (typically 80-85% reduction). Running gzip on the output file provides additional compression.

Usage

pip install Pillow # unless you already have it
python script.py input.xml output.xml

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment