Created
January 22, 2026 03:29
-
-
Save az0/bbb379c36f5b706c9584a152cbe5932f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Benchmark improvement in BleachBit `CleanerML.py` | |
| This script isolates the XML parsing from other operations, except | |
| disk I/O, but the OS should cache it. | |
| How to run: | |
| - copy this script into the root directory of the bleachbit repo | |
| - put the old parser in bleachbit/CleanerML.py | |
| - put the new parser in bleachbit/CleanerML_new.py | |
| - run this script | |
| Copyright (C) 2026 Andrew Ziem | |
| Licensed under the GNU General Public License version 3 or later. | |
| """ | |
| import time | |
| import statistics | |
| from bleachbit import CleanerML as CleanerML_old | |
| from bleachbit import CleanerML_new as CleanerML_new_module | |
# (label, module) pairs naming the two parser implementations under
# comparison; the label is used in all printed output.
parser_variants = (
    ('old', CleanerML_old),
    ('new', CleanerML_new_module),
)
def run_parser(parser_impl, paths):
    """Time one full pass of a single parser implementation.

    Instantiates ``parser_impl.CleanerML`` for every path in *paths*
    and returns the elapsed wall-clock time in seconds.
    """
    begin = time.perf_counter()
    cleaner_class = parser_impl.CleanerML
    for path in paths:
        # Storage I/O can skew the measurement, so fast storage (or a
        # warm OS cache) is assumed.
        parsed = cleaner_class(path)
        parsed.get_cleaner()
    return time.perf_counter() - begin
def eval_native_xml(iterations_per_parser=50):
    """Benchmark both parsers and print per-run and summary timings.

    Runs of the two implementations are interleaved so that cache
    warm-up and other system drift affect both parsers equally.
    """
    paths = list(CleanerML_old.list_cleanerml_files())
    timings = {}
    run_counts = {}
    for label, _module in parser_variants:
        timings[label] = []
        run_counts[label] = 0
    variant_count = len(parser_variants)
    for run_index in range(iterations_per_parser * variant_count):
        # Alternate old/new/old/new... across the whole schedule.
        label, module = parser_variants[run_index % variant_count]
        run_counts[label] += 1
        seconds = run_parser(module, paths)
        timings[label].append(seconds)
        print(f"{label} parser run {run_counts[label]}/{iterations_per_parser} took {seconds:.3f} seconds")
    for label, runs in timings.items():
        # Discard the first (cold) run whenever a second sample exists.
        steady = runs if len(runs) < 2 else runs[1:]
        if not steady:
            print(f"{label} parser had no successful runs")
        else:
            print(f"{label} parser median (excluding first): {statistics.median(steady):.3f} seconds")
def main():
    """Script entry point: run the benchmark, then show parser detection."""
    print('starting')
    eval_native_xml()
    # Show how callers can tell the implementations apart via hasattr().
    for label, module in parser_variants:
        if hasattr(module, '_ETSimpleTextNode'):
            print(f"{label} module has _ETSimpleTextNode, so new parser")
        else:
            print(f"{label} module lacks _ETSimpleTextNode, so old parser (minidom)")
    print('all done')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment