Skip to content

Instantly share code, notes, and snippets.

@az0
Created January 22, 2026 03:29
Show Gist options
  • Select an option

  • Save az0/bbb379c36f5b706c9584a152cbe5932f to your computer and use it in GitHub Desktop.

Select an option

Save az0/bbb379c36f5b706c9584a152cbe5932f to your computer and use it in GitHub Desktop.
"""
Benchmark improvement in BleachBit `CleanerML.py`
This script isolates the XML parsing from other operations, except
disk I/O, but the OS should cache it.
How to run:
- copy this script into the root directory of the bleachbit repo
- put the old parser in bleachbit/CleanerML.py
- put the new parser in bleachbit/CleanerML_new.py
- run this script
Copyright (C) 2026 Andrew Ziem
Licensed under the GNU General Public License version 3 or later.
"""
import time
import statistics
from bleachbit import CleanerML as CleanerML_old
from bleachbit import CleanerML_new as CleanerML_new_module
# (label, module) pairs to benchmark; the tuple order determines the
# interleaving sequence used in eval_native_xml().
parser_variants = (
('old', CleanerML_old),
('new', CleanerML_new_module),
)
def run_parser(parser_impl, paths):
    """Time one full pass of *parser_impl* over all CleanerML files.

    Parameters
    ----------
    parser_impl : module
        A module exposing a ``CleanerML`` class (old or new parser).
    paths : iterable of str
        Pathnames of the CleanerML XML files to parse.

    Returns
    -------
    float
        Elapsed wall-clock seconds measured with time.perf_counter().
    """
    t0 = time.perf_counter()
    cleaner_class = parser_impl.CleanerML
    # NOTE: each construction reads the file from disk, so storage I/O is
    # part of the measurement — use fast storage so the OS cache dominates.
    for cleanerml_path in paths:
        parsed = cleaner_class(cleanerml_path)
        parsed.get_cleaner()
    return time.perf_counter() - t0
def eval_native_xml(iterations_per_parser=50):
    """Benchmark both parser variants and print per-run and summary timings.

    Runs are interleaved (old, new, old, new, ...) so that cache warm-up
    and other system drift affect both variants equally.

    Parameters
    ----------
    iterations_per_parser : int
        How many timed passes to perform per parser variant.
    """
    paths = list(CleanerML_old.list_cleanerml_files())
    variant_count = len(parser_variants)
    timings = {label: [] for label, _ in parser_variants}
    run_counts = dict.fromkeys(timings, 0)
    for run_index in range(iterations_per_parser * variant_count):
        label, parser_impl = parser_variants[run_index % variant_count]
        run_counts[label] += 1
        elapsed = run_parser(parser_impl, paths)
        timings[label].append(elapsed)
        print(f"{label} parser run {run_counts[label]}/{iterations_per_parser} took {elapsed:.3f} seconds")
    for label, runs in timings.items():
        # Drop the first (cold-cache) sample when at least two runs exist.
        steady = runs[1:] if len(runs) >= 2 else runs
        if steady:
            print(f"{label} parser median (excluding first): {statistics.median(steady):.3f} seconds")
        else:
            print(f"{label} parser had no successful runs")
def main():
    """Entry point: run the benchmark, then show how to tell the parsers apart."""
    print('starting')
    eval_native_xml()
    # hasattr() detection: only the new parser defines _ETSimpleTextNode,
    # so its absence identifies the old (minidom-based) parser.
    for label, module in parser_variants:
        if hasattr(module, '_ETSimpleTextNode'):
            print(f"{label} module has _ETSimpleTextNode, so new parser")
        else:
            print(f"{label} module lacks _ETSimpleTextNode, so old parser (minidom)")
    print('all done')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment