Skip to content

Instantly share code, notes, and snippets.

@etal
Created June 23, 2012 21:24
Show Gist options
  • Select an option

  • Save etal/2980066 to your computer and use it in GitHub Desktop.

Select an option

Save etal/2980066 to your computer and use it in GitHub Desktop.
Benchmark some operations on large trees in Bio.Phylo
#!/usr/bin/env python
"""Benchmark several different operations."""
from contextlib import contextmanager
from time import time
from timeit import default_timer

try:
    from cStringIO import StringIO
    BytesIO = StringIO
except ImportError:
    # Python 3
    from io import StringIO, BytesIO

from Bio import Phylo
# --- Timer mini-framework -----------------------------------
def time_once(func_manager):
    """Time a single run of a managed benchmark function.

    Only the call itself is timed; whatever the context manager does on
    entry and exit happens outside the measured interval.

    Args:
        func_manager: Zero-argument callable returning a context manager
            whose ``as`` target is the zero-argument function to time.

    Returns:
        Elapsed time of the single call, in seconds.
    """
    with func_manager() as func:
        # default_timer is the highest-resolution clock available for
        # benchmarking and, on Python 3, is immune to system clock changes.
        t_start = default_timer()
        func()
        t_end = default_timer()
    return t_end - t_start
def median_time(func_manager, n_runs=101):
    """Take the median of multiple timed runs.

    Args:
        func_manager: Zero-argument callable returning a context manager,
            as accepted by ``time_once``.
        n_runs: Number of timed runs to perform. Defaults to 101; an odd
            count gives a true median, while an even count returns the
            upper of the two middle values.

    Returns:
        The middle element of the sorted run durations, in seconds.

    Raises:
        ValueError: If ``n_runs`` is less than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    durations = sorted(time_once(func_manager) for _ in range(n_runs))
    return durations[n_runs // 2]
def handle_manager(func, fname, fmt):
    """Manage a file handle in memory, to avoid re-reading from disk.

    The file is read once, when this factory is called; every later run
    reuses the same in-memory handle.

    Args:
        func: Callable invoked as ``func(handle, fmt)``.
        fname: Path of the file whose contents are loaded into memory.
        fmt: Format name passed through unchanged to ``func``.

    Returns:
        A zero-argument callable producing a context manager. Entering it
        yields a zero-argument function that runs ``func`` on the in-memory
        handle; leaving it rewinds the handle, even if the run raised.
    """
    with open(fname) as file_handle:
        mem_handle = StringIO(file_handle.read())

    def do_func():
        func(mem_handle, fmt)

    # Named distinctly so it does not shadow the enclosing factory.
    @contextmanager
    def rewinding_context():
        try:
            yield do_func
        finally:
            # Reset for the next run, which reads from the start again.
            mem_handle.seek(0)

    return rewinding_context
def tree_manager(func, fname, fmt):
    """Regenerate fresh copies of a tree from an in-memory string.

    The file is read once, when this factory is called. Each time the
    returned context manager is entered, the in-memory handle is rewound
    and a new tree is parsed from it with ``Phylo.read``, so every run gets
    its own tree object.

    Args:
        func: Callable invoked as ``func(tree)``.
        fname: Path of the tree file loaded into memory.
        fmt: Format name passed to ``Phylo.read``.

    Returns:
        A zero-argument callable producing a context manager that manages
        the actual function: entering it yields a zero-argument function
        that runs ``func`` on the freshly parsed tree.
    """
    with open(fname) as file_handle:
        mem_handle = StringIO(file_handle.read())

    @contextmanager
    def fresh_tree_context():
        # Parsing happens here, on entry, before the function is handed out.
        mem_handle.seek(0)
        tree = Phylo.read(mem_handle, fmt)

        def do_func():
            func(tree)

        yield do_func

    return fresh_tree_context
# --- Benchmark operations -----------------------------------
def parse_many(handle, fmt):
    """Parse a Newick file containing many trees.

    Iterates ``Phylo.parse(handle, fmt)`` to exhaustion and discards every
    tree; only the parsing work matters.

    Args:
        handle: Open handle to read the trees from.
        fmt: Format name passed to ``Phylo.parse``.
    """
    for _tree in Phylo.parse(handle, fmt):
        pass
def read_big(handle, fmt):
    """Read a file containing a single, large tree.

    The parsed tree is discarded; only the parsing work matters.

    Args:
        handle: Open handle to read the tree from.
        fmt: Format name passed to ``Phylo.read``.
    """
    Phylo.read(handle, fmt)
def write_big(tree):
    """Write a single, large tree to file.

    The tree is written in 'newick' format to a throwaway in-memory
    ``StringIO``, so no disk I/O is involved.

    Args:
        tree: Tree object to serialize with ``Phylo.write``.
    """
    Phylo.write(tree, StringIO(), 'newick')
def write_big_xml(tree):
    """Write a single, large tree as PhyloXML.

    The tree is written in 'phyloxml' format to a throwaway in-memory
    ``BytesIO``, so no disk I/O is involved.

    Args:
        tree: Tree object to serialize with ``Phylo.write``.
    """
    # NOTE(review): BytesIO rather than StringIO, presumably because the
    # PhyloXML writer emits bytes on Python 3 -- confirm against Bio.Phylo.
    Phylo.write(tree, BytesIO(), 'phyloxml')
def reroot_tree(tree):
    """Reroot a tree at every node.

    Args:
        tree: Tree object providing ``find_clades()`` and
            ``root_with_outgroup(node)``; it is modified in place.
    """
    # Snapshot the clades first: rerooting changes the tree while we loop,
    # so iterating find_clades() lazily would walk a moving structure.
    for node in list(tree.find_clades()):
        tree.root_with_outgroup(node)
def collapse_all_lt50(tree):
    """Collapse all clades with bootstrap values < 50%.

    The root is never collapsed, and clades without a confidence value are
    left alone.

    Args:
        tree: Tree object providing ``root``, ``get_nonterminals()``,
            ``find_any(node)`` and ``collapse(node)``; it is modified in
            place.
    """
    def is_weak_branch(clade):
        """Return True if the clade has a confidence value below 50."""
        return clade.confidence is not None and clade.confidence < 50

    for node in tree.get_nonterminals():
        if node == tree.root:
            continue
        # Only collapse nodes the tree can still find -- presumably an
        # earlier collapse may already have removed this one.
        if tree.find_any(node) and is_weak_branch(node):
            tree.collapse(node)
def total_branch_length(tree):
    """Sum all branch lengths in the tree.

    The sum is computed by ``tree.total_branch_length()`` and discarded;
    only the computation matters.

    Args:
        tree: Tree object providing ``total_branch_length()``.
    """
    tree.total_branch_length()
def ladderize(tree):
    """Ladderize the tree by calling ``tree.ladderize()``.

    Args:
        tree: Tree object providing ``ladderize()``.
    """
    tree.ladderize()
def count_terminals(tree):
    """Count the tree's terminals by calling ``tree.count_terminals()``.

    The count is discarded; only the computation matters.

    Args:
        tree: Tree object providing ``count_terminals()``.
    """
    tree.count_terminals()
# --- Main script --------------------------------------------
# Data files
# From: http://github.com/camwebb/tree-of-trees/
# https://raw.github.com/camwebb/tree-of-trees/master/megatrees_other/davies2004.bl.new
EX_MEDIUM = 'davies2004.bl.new'  # 440 terminals
# Davies 2004 copies rerooted at each node
EX_MANY = 'davies-reroot.bl.nwk'  # 816 trees
# ENH - get a Newick file of 1000 bootstrap trees
# From: http://www.evoio.org/wiki/PhylotasticUseCases#Big_Trees
# http://www.evoio.org/wg/evoio/images/3/37/Smith_2011_angiosperms.txt
EX_BIG = 'Smith_2011_angiosperms.txt'  # 55473 terminals
# From phyloxml.org
# converted from EX_BIG with phylo_converter
EX_BIG_XML = 'Smith_2011_angiosperms.xml'  # 55473

# Wrappers for running the benchmark operations.
# Each entry pairs a report label with a context-manager factory; the data
# file is read into memory here, when the factory is built.
benchmarks = (
    ("read_big", handle_manager(read_big, EX_BIG, 'newick')),
    ("read_big_xml", handle_manager(read_big, EX_BIG_XML, 'phyloxml')),
    ("write_big", tree_manager(write_big, EX_BIG, 'newick')),
    ("write_big_xml", tree_manager(write_big_xml, EX_BIG, 'newick')),
    ("read_medium", handle_manager(read_big, EX_MEDIUM, 'newick')),
    ("parse_many", handle_manager(parse_many, EX_MANY, 'newick')),
    ("reroot_tree", tree_manager(reroot_tree, EX_MEDIUM, 'newick')),
    ("collapse_all_lt50", tree_manager(collapse_all_lt50, EX_MEDIUM, 'newick')),
    ("total_branch_length", tree_manager(total_branch_length, EX_MEDIUM, 'newick')),
    ("ladderize", tree_manager(ladderize, EX_MEDIUM, 'newick')),
    ("count_terminals", tree_manager(count_terminals, EX_MEDIUM, 'newick')),
)

# Report one line per benchmark: left-aligned label, then the median time.
for bm_name, bm_runner in benchmarks:
    bm_time = median_time(bm_runner)
    print(bm_name.ljust(22) + str(bm_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment