odinho/resize_android_sms_backup_and_restore_images_and_remove_dupes.py

## resize_android_sms_backup_and_restore_images_and_remove_dupes.py
import xml.etree.ElementTree as ET
import base64
from PIL import Image
import io
import sys
from pathlib import Path
import hashlib

def resize_base64_image(base64_string, max_size=(1024, 1024)):
    """
    Shrink a base64 encoded image to fit inside ``max_size`` and re-encode it as JPEG.

    The aspect ratio is preserved and images that already fit are not scaled
    (they are still re-encoded as JPEG at quality 85).

    Args:
        base64_string: The image file contents, base64 encoded.
        max_size: ``(max_width, max_height)`` bounding box in pixels.

    Returns:
        The base64 encoded JPEG, or ``base64_string`` unchanged when the image
        cannot be processed or when re-encoding would not make it smaller.
    """
    # Modes Pillow's JPEG writer accepts as-is; everything else (RGBA, P, LA,
    # PA, I;16, ...) has to be converted first or the save below fails.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    try:
        # Local import: only this function needs ImageOps. It sits inside the
        # try so a missing/old Pillow degrades to "keep the original image".
        from PIL import ImageOps

        # Decode base64 string to bytes
        image_data = base64.b64decode(base64_string)

        # Open image using PIL
        with io.BytesIO(image_data) as img_io:
            img = Image.open(img_io)

            # The JPEG written below carries no EXIF block, so the orientation
            # tag must be baked into the pixels or photos come out rotated.
            img = ImageOps.exif_transpose(img)

            # Convert to RGB if the mode cannot be stored in a JPEG
            if img.mode not in jpeg_modes:
                img = img.convert('RGB')

            # Calculate new dimensions maintaining aspect ratio
            ratio = min(max_size[0] / img.width, max_size[1] / img.height)
            if ratio < 1:  # Only resize if image is larger than max_size
                # Clamp to 1 pixel: a very thin image would otherwise round
                # down to a zero-sized dimension, which resize() rejects.
                new_size = (
                    max(1, int(img.width * ratio)),
                    max(1, int(img.height * ratio)),
                )
                img = img.resize(new_size, Image.Resampling.LANCZOS)

            # Save resized image to bytes
            output_buffer = io.BytesIO()
            img.save(output_buffer, format='JPEG', quality=85, optimize=True)

        # Convert back to base64
        resized_string = base64.b64encode(output_buffer.getvalue()).decode('utf-8')

    except Exception as e:
        # Deliberately broad: this is a best-effort step and any decoding or
        # encoding failure should leave the original attachment untouched.
        print(f"Error processing image: {str(e)}")
        return base64_string

    # Never make the payload bigger (e.g. small PNGs that JPEG encodes worse).
    if len(resized_string) >= len(base64_string):
        return base64_string
    return resized_string

def get_message_key(msg_elem):
    """
    Build the deduplication fingerprint for one message element.

    The fingerprint is the MD5 hex digest of ``date|address|text``. The text
    is the ``body`` attribute for an ``sms`` element; for any other element it
    is the ``text`` attribute of the first ``text/plain`` part found under
    ``parts`` (empty when there is no such part).
    """
    if msg_elem.tag == 'sms':
        body = msg_elem.get('body', '')
    else:
        # Anything that is not an SMS is treated as an MMS
        container = msg_elem.find('parts')
        candidates = [] if container is None else container.findall('part')
        body = next(
            (p.get('text', '') for p in candidates if p.get('ct') == 'text/plain'),
            '',
        )

    timestamp = msg_elem.get('date', '')
    address = msg_elem.get('address', '')

    # Hash the combined fields so the key stays short regardless of text length
    fingerprint_source = f"{timestamp}|{address}|{body}"
    return hashlib.md5(fingerprint_source.encode()).hexdigest()

def _dedupe_messages(root):
    """
    Collect the unique ``sms``/``mms`` elements found anywhere under ``root``.

    Messages are keyed with ``get_message_key``. The first occurrence wins,
    except that an MMS replaces an earlier SMS with the same key.

    Returns:
        ``(messages, removed)``: the surviving elements (all SMS first, then
        MMS, with a replacing MMS placed where it was encountered) and the
        number of elements that were dropped.
    """
    kept = {}  # key -> element; dict insertion order is the output order
    removed = 0

    for msg in root.findall('.//sms') + root.findall('.//mms'):
        key = get_message_key(msg)
        earlier = kept.get(key)
        if earlier is None:
            kept[key] = msg
            continue

        # Exactly one element of the colliding pair is dropped either way.
        removed += 1
        if msg.tag == 'mms' and earlier.tag == 'sms':
            # Prefer the MMS version. Delete before re-inserting so the MMS
            # lands at the end of the ordering instead of in the SMS's slot.
            del kept[key]
            kept[key] = msg

    return list(kept.values()), removed


def process_sms_backup(input_file, output_file):
    """
    Process the SMS backup XML file, resize images, remove videos, deduplicate messages,
    and save to a new file.

    Args:
        input_file: Path of the backup XML to read.
        output_file: Path the processed XML is written to.

    Side effects:
        Writes ``output_file`` and prints progress and summary statistics.
        Elements of the parsed input tree are modified in place and moved
        under the new root.
    """
    # Parse the XML file
    tree = ET.parse(input_file)
    root = tree.getroot()
    original_size = Path(input_file).stat().st_size

    # First pass: identify unique messages across both SMS and MMS
    unique_messages, removed_duplicates = _dedupe_messages(root)

    # Track statistics
    total_images = 0
    processed_images = 0
    removed_videos = 0

    # Create new root with same attributes
    new_root = ET.Element('smses', dict(root.attrib))
    if 'count' in new_root.attrib:
        # The copied value describes the input; presumably it is the message
        # count, so refresh it to match what is actually written.
        new_root.set('count', str(len(unique_messages)))

    # Second pass: shrink MMS attachments and add every message to the new root
    for msg in unique_messages:
        parts = msg.find('parts') if msg.tag == 'mms' else None
        if parts is not None:
            # findall() returns a list, so removing parts while looping is safe
            for part in parts.findall('part'):
                content_type = part.get('ct', '')

                # Handle videos
                if content_type.startswith('video/'):
                    parts.remove(part)
                    removed_videos += 1
                    continue

                # Handle images
                if not content_type.startswith('image/'):
                    continue
                total_images += 1
                image_data = part.get('data')
                if not image_data:
                    continue

                print(f"Processing image {total_images}...")
                resized_data = resize_base64_image(image_data)
                part.set('data', resized_data)
                if resized_data != image_data:
                    # resize_base64_image returns either its input or a JPEG,
                    # so a changed payload is a JPEG whatever it was before.
                    part.set('ct', 'image/jpeg')
                processed_images += 1

        new_root.append(msg)

    # Create new tree and write to file
    new_tree = ET.ElementTree(new_root)
    new_tree.write(output_file, encoding='utf-8', xml_declaration=True)

    # Calculate final size
    final_size = Path(output_file).stat().st_size

    # Print statistics
    print("\nProcessing complete!")
    print(f"Total images found: {total_images}")
    print(f"Images processed: {processed_images}")
    print(f"Videos removed: {removed_videos}")
    print(f"Duplicate messages removed: {removed_duplicates}")
    print(f"Original file size: {original_size / 1024 / 1024:.2f} MB")
    print(f"Final file size: {final_size / 1024 / 1024:.2f} MB")
    print(f"Size reduction: {(1 - final_size/original_size) * 100:.1f}%")

def main():
    """
    Command-line entry point.

    Expects exactly two arguments, the input and output XML paths. Prints a
    usage line and exits with status 1 otherwise.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python script.py input.xml output.xml")
        sys.exit(1)

    source_path, target_path = cli_args

    print(f"Processing {source_path}...")
    process_sms_backup(source_path, target_path)


# Run only when executed as a script, not when imported
if __name__ == "__main__":
    main()
	import xml.etree.ElementTree as ET
	import base64
	from PIL import Image
	import io
	import sys
	from pathlib import Path
	import hashlib

	def resize_base64_image(base64_string, max_size=(1024, 1024)):
	"""
	Resize a base64 encoded image while maintaining aspect ratio.
	Returns the resized image as a base64 string.
	"""
	try:
	# Decode base64 string to bytes
	image_data = base64.b64decode(base64_string)

	# Open image using PIL
	with io.BytesIO(image_data) as img_io:
	img = Image.open(img_io)

	# Convert to RGB if necessary
	if img.mode in ('RGBA', 'P'):
	img = img.convert('RGB')

	# Calculate new dimensions maintaining aspect ratio
	ratio = min(max_size[0] / img.width, max_size[1] / img.height)
	if ratio < 1: # Only resize if image is larger than max_size
	new_size = (int(img.width * ratio), int(img.height * ratio))
	img = img.resize(new_size, Image.Resampling.LANCZOS)

	# Save resized image to bytes
	output_buffer = io.BytesIO()
	img.save(output_buffer, format='JPEG', quality=85, optimize=True)

	# Convert back to base64
	return base64.b64encode(output_buffer.getvalue()).decode('utf-8')

	except Exception as e:
	print(f"Error processing image: {str(e)}")
	return base64_string

	def get_message_key(msg_elem):
	"""
	Generate a unique key for a message based on its content and metadata,
	regardless of whether it's SMS or MMS.
	"""
	# Get basic attributes
	date = msg_elem.get('date', '')
	address = msg_elem.get('address', '')

	# Get message text content - handle both SMS and MMS formats
	text = ''
	if msg_elem.tag == 'sms':
	text = msg_elem.get('body', '')
	else: # mms
	parts = msg_elem.find('parts')
	if parts is not None:
	for part in parts.findall('part'):
	if part.get('ct') == 'text/plain':
	text = part.get('text', '')
	break

	# Combine key elements and hash them
	key_string = f"{date}|{address}|{text}"
	return hashlib.md5(key_string.encode()).hexdigest()

	def process_sms_backup(input_file, output_file):
	"""
	Process the SMS backup XML file, resize images, remove videos, deduplicate messages,
	and save to a new file.
	"""
	# Parse the XML file
	tree = ET.parse(input_file)
	root = tree.getroot()

	# Track statistics
	total_images = 0
	processed_images = 0
	removed_videos = 0
	removed_duplicates = 0
	original_size = Path(input_file).stat().st_size

	# Track unique messages and keep only the first occurrence
	seen_messages = {}
	unique_messages = []

	# First pass: identify unique messages across both SMS and MMS
	for msg in root.findall('.//sms') + root.findall('.//mms'):
	msg_key = get_message_key(msg)
	if msg_key not in seen_messages:
	seen_messages[msg_key] = msg.tag # Store the type of the first occurrence
	unique_messages.append(msg)
	else:
	# If this is an MMS and the previous was SMS, prefer the MMS version
	if msg.tag == 'mms' and seen_messages[msg_key] == 'sms':
	# Remove the SMS version and keep the MMS version
	unique_messages = [m for m in unique_messages if get_message_key(m) != msg_key]
	unique_messages.append(msg)
	seen_messages[msg_key] = 'mms'
	else:
	removed_duplicates += 1

	# Create new root with same attributes
	new_root = ET.Element('smses')
	for key, value in root.attrib.items():
	new_root.set(key, value)

	# Add unique messages to new root
	for msg in unique_messages:
	# Process images and videos before adding
	if msg.tag == 'mms':
	parts = msg.find('parts')
	if parts is not None:
	# Track parts to remove
	parts_to_remove = []

	for part in parts.findall('part'):
	content_type = part.get('ct', '')

	# Handle videos
	if content_type.startswith('video/'):
	parts_to_remove.append(part)
	removed_videos += 1
	continue

	# Handle images
	if content_type.startswith('image/'):
	total_images += 1
	image_data = part.get('data')

	if image_data:
	print(f"Processing image {total_images}...")
	# Resize the image
	resized_data = resize_base64_image(image_data)
	part.set('data', resized_data)
	processed_images += 1

	# Remove video parts
	for part in parts_to_remove:
	parts.remove(part)

	new_root.append(msg)

	# Create new tree and write to file
	new_tree = ET.ElementTree(new_root)
	new_tree.write(output_file, encoding='utf-8', xml_declaration=True)

	# Calculate final size
	final_size = Path(output_file).stat().st_size

	# Print statistics
	print("\nProcessing complete!")
	print(f"Total images found: {total_images}")
	print(f"Images processed: {processed_images}")
	print(f"Videos removed: {removed_videos}")
	print(f"Duplicate messages removed: {removed_duplicates}")
	print(f"Original file size: {original_size / 1024 / 1024:.2f} MB")
	print(f"Final file size: {final_size / 1024 / 1024:.2f} MB")
	print(f"Size reduction: {(1 - final_size/original_size) * 100:.1f}%")

	def main():
	if len(sys.argv) != 3:
	print("Usage: python script.py input.xml output.xml")
	sys.exit(1)

	input_file = sys.argv[1]
	output_file = sys.argv[2]

	print(f"Processing {input_file}...")
	process_sms_backup(input_file, output_file)

	if __name__ == "__main__":
	main()
No results found