Created
June 23, 2012 21:24
-
-
Save etal/2980066 to your computer and use it in GitHub Desktop.
Benchmark some operations on large trees in Bio.Phylo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """Benchmark several different operations.""" | |
| from time import time | |
| from contextlib import contextmanager | |
| try: | |
| from cStringIO import StringIO | |
| BytesIO = StringIO | |
| except ImportError: | |
| # Python 3 | |
| from io import StringIO, BytesIO | |
| from Bio import Phylo | |
| # --- Timer mini-framework ----------------------------------- | |
def time_once(func_manager):
    """Time a single run.

    ``func_manager`` is a zero-argument callable that returns a context
    manager; the value produced on entering it is the zero-argument function
    to time. Only the call to that function is measured -- whatever the
    context manager does on entry and exit is excluded.

    Returns the elapsed time in seconds as a float.
    """
    # Use the platform's preferred benchmarking clock instead of wall-clock
    # ``time.time()``, which can be coarse and may jump if the system clock is
    # adjusted mid-run.
    from timeit import default_timer

    with func_manager() as func:
        t_start = default_timer()
        func()
        t_end = default_timer()
    return t_end - t_start
def median_time(func_manager, n_runs=101):
    """Take the median of multiple timed runs.

    Args:
        func_manager: passed unchanged to ``time_once`` for every run.
        n_runs: how many timed runs to perform. Defaults to 101, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        The duration at index ``n_runs // 2`` of the sorted run times. For an
        odd ``n_runs`` this is the exact median; for an even ``n_runs`` it is
        the upper of the two middle values.

    Raises:
        ValueError: if ``n_runs`` is less than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    durations = sorted(time_once(func_manager) for _ in range(n_runs))
    return durations[n_runs // 2]
def handle_manager(func, fname, fmt):
    """Wrap ``func`` so that it reads from an in-memory copy of a file.

    The file ``fname`` is read from disk once, when this function is called,
    and its text is kept in a ``StringIO`` buffer that all later runs share.

    Returns a zero-argument factory for context managers. Entering one yields
    a zero-argument callable that invokes ``func(buffer, fmt)``; leaving it
    rewinds the buffer so the next run starts from the beginning.
    """
    with open(fname) as source:
        buffer = StringIO(source.read())

    def run():
        func(buffer, fmt)

    @contextmanager
    def rewinding_context():
        try:
            yield run
        finally:
            # Rewind even when the run raised, so the buffer is reusable.
            buffer.seek(0)

    return rewinding_context
def tree_manager(func, fname, fmt):
    """Wrap ``func`` so that every run receives a freshly parsed tree.

    The file ``fname`` is read from disk once, when this function is called,
    and its text is kept in a ``StringIO`` buffer.

    Returns a zero-argument factory for context managers. Entering one rewinds
    the buffer, parses a new tree from it with ``Phylo.read`` and yields a
    zero-argument callable that invokes ``func(tree)``. The parse happens on
    entry, before the callable is handed out.
    """
    with open(fname) as source:
        buffer = StringIO(source.read())

    @contextmanager
    def fresh_tree_context():
        buffer.seek(0)
        parsed = Phylo.read(buffer, fmt)

        def run():
            func(parsed)

        yield run

    return fresh_tree_context
| # --- Benchmark operations ----------------------------------- | |
def parse_many(handle, fmt):
    """Iterate over every tree ``Phylo.parse`` produces from ``handle``.

    The trees themselves are discarded; the loop exists only to drive the
    parser through the whole input.
    """
    for _tree in Phylo.parse(handle, fmt):
        continue
def read_big(handle, fmt):
    """Parse one tree from ``handle`` with ``Phylo.read``.

    The parsed tree is not returned or stored.
    """
    # Result intentionally dropped: only the parse itself is of interest.
    Phylo.read(handle, fmt)
def write_big(tree):
    """Serialize ``tree`` as Newick into a throwaway in-memory text buffer."""
    sink = StringIO()
    Phylo.write(tree, sink, 'newick')
def write_big_xml(tree):
    """Serialize ``tree`` as PhyloXML into a throwaway in-memory bytes buffer."""
    sink = BytesIO()
    Phylo.write(tree, sink, 'phyloxml')
def reroot_tree(tree):
    """Reroot ``tree`` on each of its clades, one after another."""
    # Collect all clades up front, before any rerooting happens (presumably
    # because rerooting restructures the tree being traversed).
    outgroups = list(tree.find_clades())
    for outgroup in outgroups:
        tree.root_with_outgroup(outgroup)
def collapse_all_lt50(tree):
    """Collapse every non-root internal clade whose confidence is below 50.

    Clades whose ``confidence`` is ``None`` are left untouched, as are clades
    that ``tree.find_any`` no longer locates in the tree.
    """
    def is_weak_branch(clade):
        return clade.confidence is not None and clade.confidence < 50

    for clade in tree.get_nonterminals():
        if clade == tree.root:
            continue
        # Look the clade up before testing its support, in that order.
        if not tree.find_any(clade):
            continue
        if is_weak_branch(clade):
            tree.collapse(clade)
def total_branch_length(tree):
    """Call ``tree.total_branch_length()`` and discard the result."""
    # The sum is not returned; the call is made only for its running time.
    tree.total_branch_length()
def ladderize(tree):
    """Call ``tree.ladderize()`` with its default arguments."""
    # Any return value is ignored; nothing is returned to the caller.
    tree.ladderize()
def count_terminals(tree):
    """Call ``tree.count_terminals()`` and discard the count."""
    # The count is not returned; the call is made only for its running time.
    tree.count_terminals()
| # --- Main script -------------------------------------------- | |
# Input data files (expected in the current working directory).

# Medium tree, 440 terminals.
# From: http://github.com/camwebb/tree-of-trees/
# https://raw.github.com/camwebb/tree-of-trees/master/megatrees_other/davies2004.bl.new
EX_MEDIUM = 'davies2004.bl.new'

# Many trees, 816 in total: copies of Davies 2004 rerooted at each node.
# ENH - get a Newick file of 1000 bootstrap trees
EX_MANY = 'davies-reroot.bl.nwk'

# Big tree, 55473 terminals.
# From: http://www.evoio.org/wiki/PhylotasticUseCases#Big_Trees
# http://www.evoio.org/wg/evoio/images/3/37/Smith_2011_angiosperms.txt
EX_BIG = 'Smith_2011_angiosperms.txt'

# The same big tree (55473 terminals) as PhyloXML.
# From phyloxml.org
# converted from EX_BIG with phylo_converter
EX_BIG_XML = 'Smith_2011_angiosperms.xml'

# One row per benchmark: (label, manager factory, operation, data file, format).
# Rows are listed in the order in which they are set up and run.
_BENCHMARK_SPECS = (
    ("read_big", handle_manager, read_big, EX_BIG, 'newick'),
    ("read_big_xml", handle_manager, read_big, EX_BIG_XML, 'phyloxml'),
    ("write_big", tree_manager, write_big, EX_BIG, 'newick'),
    ("write_big_xml", tree_manager, write_big_xml, EX_BIG, 'newick'),
    ("read_medium", handle_manager, read_big, EX_MEDIUM, 'newick'),
    ("parse_many", handle_manager, parse_many, EX_MANY, 'newick'),
    ("reroot_tree", tree_manager, reroot_tree, EX_MEDIUM, 'newick'),
    ("collapse_all_lt50", tree_manager, collapse_all_lt50, EX_MEDIUM, 'newick'),
    ("total_branch_length", tree_manager, total_branch_length, EX_MEDIUM, 'newick'),
    ("ladderize", tree_manager, ladderize, EX_MEDIUM, 'newick'),
    ("count_terminals", tree_manager, count_terminals, EX_MEDIUM, 'newick'),
)

# Wrappers for running the benchmark operations. Every data file is loaded
# into memory here, while the table is built, before any benchmark runs.
benchmarks = tuple(
    (label, make_manager(operation, data_file, data_fmt))
    for label, make_manager, operation, data_file, data_fmt in _BENCHMARK_SPECS
)

# Report each benchmark's median time, with the label padded to 22 columns.
for bm_name, bm_runner in benchmarks:
    bm_time = median_time(bm_runner)
    print("%-22s%s" % (bm_name, bm_time))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment