Skip to content

Instantly share code, notes, and snippets.

@az0
Created January 22, 2026 03:29
Show Gist options
  • Select an option

  • Save az0/bbb379c36f5b706c9584a152cbe5932f to your computer and use it in GitHub Desktop.

Select an option

Save az0/bbb379c36f5b706c9584a152cbe5932f to your computer and use it in GitHub Desktop.
"""
Benchmark improvement in BleachBit `CleanerML.py`
This script isolates the XML parsing from other operations, except
disk I/O, but the OS should cache it.
How to run:
- copy this script into the root directory of the bleachbit repo
- put the old parser in bleachbit/CleanerML.py
- put the new parser in bleachbit/CleanerML_new.py
- run this script
Copyright (C) 2026 Andrew Ziem
Licensed under the GNU General Public License version 3 or later.
"""
import time
import statistics
from bleachbit import CleanerML as CleanerML_old
from bleachbit import CleanerML_new as CleanerML_new_module
# (label, module) pairs to benchmark; the tuple order determines the
# interleaving sequence used in eval_native_xml().
parser_variants = (
('old', CleanerML_old),
('new', CleanerML_new_module),
)
def run_parser(parser_impl, paths):
    """Time one full pass of *parser_impl* over all CleanerML files.

    Parameters
    ----------
    parser_impl : module
        A module exposing a ``CleanerML`` class (old or new parser).
    paths : iterable of str
        Pathnames of the CleanerML XML files to parse.

    Returns
    -------
    float
        Elapsed wall-clock seconds measured with time.perf_counter().
    """
    t0 = time.perf_counter()
    cleaner_class = parser_impl.CleanerML
    # NOTE: each construction reads the file from disk, so storage I/O is
    # part of the measurement — use fast storage so the OS cache dominates.
    for cleanerml_path in paths:
        parsed = cleaner_class(cleanerml_path)
        parsed.get_cleaner()
    return time.perf_counter() - t0
def eval_native_xml(iterations_per_parser=50):
    """Benchmark both parser variants and print per-run and summary timings.

    Runs are interleaved (old, new, old, new, ...) so that cache warm-up
    and other system drift affect both variants equally.

    Parameters
    ----------
    iterations_per_parser : int
        How many timed passes to perform per parser variant.
    """
    paths = list(CleanerML_old.list_cleanerml_files())
    variant_count = len(parser_variants)
    timings = {label: [] for label, _ in parser_variants}
    run_counts = dict.fromkeys(timings, 0)
    for run_index in range(iterations_per_parser * variant_count):
        label, parser_impl = parser_variants[run_index % variant_count]
        run_counts[label] += 1
        elapsed = run_parser(parser_impl, paths)
        timings[label].append(elapsed)
        print(f"{label} parser run {run_counts[label]}/{iterations_per_parser} took {elapsed:.3f} seconds")
    for label, runs in timings.items():
        # Drop the first (cold-cache) sample when at least two runs exist.
        steady = runs[1:] if len(runs) >= 2 else runs
        if steady:
            print(f"{label} parser median (excluding first): {statistics.median(steady):.3f} seconds")
        else:
            print(f"{label} parser had no successful runs")
def main():
    """Entry point: run the benchmark, then show how to tell the parsers apart."""
    print('starting')
    eval_native_xml()
    # hasattr() detection: only the new parser defines _ETSimpleTextNode,
    # so its absence identifies the old (minidom-based) parser.
    for label, module in parser_variants:
        if hasattr(module, '_ETSimpleTextNode'):
            print(f"{label} module has _ETSimpleTextNode, so new parser")
        else:
            print(f"{label} module lacks _ETSimpleTextNode, so old parser (minidom)")
    print('all done')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment