Skip to content

Instantly share code, notes, and snippets.

@etal
Last active December 14, 2015 02:49
Show Gist options
  • Select an option

  • Save etal/5016162 to your computer and use it in GitHub Desktop.

Select an option

Save etal/5016162 to your computer and use it in GitHub Desktop.
Extract well-supported clades from a phylogenetic tree as sequence clusters.
#!/usr/bin/env python
"""Extract well-supported clades from a phylogenetic tree as sequence clusters."""
import os.path
import subprocess
import sys
import tempfile
from cStringIO import StringIO
from Bio import AlignIO, SeqIO, Phylo
from Bio.Phylo.BaseTree import BranchColor
from Bio.Graphics.ColorSpiral import ColorSpiral
# ENH:
# argparse: --size, --confidence, --tree, --outgroup
# cut on longest branches
MIN_CLUSTER_SIZE = 4
# MIN_CONFIDENCE = 0.7
fafname = sys.argv[1]
assert os.path.isfile(fafname)
try:
aln = AlignIO.read(fafname, 'fasta')
except:
# run mafft
alndata = subprocess.check_output(['mafft', '--quiet', '--auto', fafname])
aln = AlignIO.read(StringIO(alndata), 'fasta')
# ENH: blocks
with tempfile.NamedTemporaryFile(mode='w') as tmp:
AlignIO.write(aln, tmp, 'fasta')
tmp.flush()
treedata = subprocess.check_output(['fasttree', '-pseudo', '-gamma', '-wag',
tmp.name])
tree = Phylo.read(StringIO(treedata), 'newick')
# Collapse weakly supported splits
confs = [c.confidence for c in tree.find_clades() if c.confidence is not None]
MIN_CONFIDENCE = sum(confs) / len(confs)
tree.collapse_all(lambda c: c.confidence < MIN_CONFIDENCE)
# Extract remaining clusters of acceptable size
clusters = {}
unclustered = set()
seen_subnodes = set()
for clade in tree.find_clades(order='level'):
if clade in seen_subnodes or clade.confidence is None:
continue
elif clade.is_terminal():
unclustered.add(clade.name)
else:
# Viable clade, I guess?
# Extract all terminals into a new cluster
subnodes = list(clade.find_clades())
seen_subnodes.update(subnodes)
taxa = set([k.name for k in subnodes if k.is_terminal()])
if len(taxa) >= MIN_CLUSTER_SIZE:
clusters[clade] = taxa
else:
unclustered.update(taxa)
# Write the sequences in each cluster to individual files
# Also colorize the tree to show where the clusters are
seq_idx = SeqIO.to_dict(SeqIO.parse(fafname, 'fasta'))
colors = [BranchColor(*map(lambda x: int(x*255), rgb))
for rgb in ColorSpiral().get_colors(len(clusters))]
def write_cluster(cluster, fname):
records = [seq_idx[seqid] for seqid in sorted(cluster)]
SeqIO.write(records, fname, 'fasta')
print >>sys.stderr, "Wrote", fname
for i, item in enumerate(sorted(clusters.iteritems(), reverse=True,
key=lambda kv: len(kv[1]))):
clade, cluster = item
write_cluster(cluster, os.path.basename(fafname) + '.' + str(i))
clade.color = colors[i]
clade.width = 2
if unclustered:
write_cluster(unclustered, os.path.basename(fafname) + '.Unique')
treefname = os.path.basename(fafname) + '.xml'
Phylo.write(tree, treefname, 'phyloxml')
print >>sys.stderr, "Wrote", treefname
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment