Created
April 6, 2011 22:16
-
-
Save xydrolase/906658 to your computer and use it in GitHub Desktop.
A PyMOL extension script for building mixed coarse-grained model PDB files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """ mixedcg.py | |
| A PyMOL extension script for building mixed coarse grained model. | |
| Author: Xin Yin <xinyin at iastate dot edu> | |
| Biopython dependencies: | |
| This script requires Biopython module for communicating with NCBI | |
| server. You should first install Biopython onto your computer, and then | |
| copy the Bio directory installed at | |
| /path/to/python/lib/site-packages/ | |
| (or create a symbolic link of the Bio directory) to | |
| /path/to/pymol/ext/lib/site-packages | |
| Try | |
| > import Bio | |
| in PyMOL to examine if you have correctly installed Biopython (for | |
| PyMOL). | |
| Example usage: | |
| In this example we are intended to prepare PDB files for MAVEN to run | |
| mixed coarse-grained ANM, say the source PDB entry is 1A7S. | |
| > fetch 1a7s | |
| > run /path/to/mixedcg.py # this step loads this script and registered | |
| # the cmd.mixed_cg() method | |
| > cmd.mixed_cg('1a7s', # name of structure | |
| around=10, # use 10 angstroms as cutoff radius | |
| # for the PyMOL "object around radius" | |
| # selector | |
| byres=True, # Use by residue selecting mode | |
| save=True # Save the PDB files immediately | |
| ) | |
| After running this, you will have 4 new files, namely, | |
| 1a7s_(fine|coarse)_grained.pdb | |
| 1a7s_mixed_cg.pdb | |
| 1a7s_model.txt | |
| You can then import 1a7s_mixed_cg.pdb to MAVEN for running mixed-cg ANM. | |
| 1a7s_model.txt contains the the number of atoms in each PDB file you | |
| sent to concatenate and also the atom indices for the active site | |
| residues that you will need for motion comparisons. | |
| Example: | |
| Atoms in each file: | |
| 1nes_coarse_grained.pdb - 180 | |
| 1nes_fine_grained.pdb - 460 | |
| Atoms in active site residues: | |
| 263:272,353:357,476:480 | |
| You will need all these numbers for running ANM and the dynamics | |
| comparison. | |
| -------- | |
| For PDB entry 1NES, the annotated active sites from NCBI are incorrect. | |
| You can then explicitly specify the active sites by passing the | |
| argument `active_sites` to cmd.mixed_cg(). | |
| Also, 1NES PDB file has 3 chains, namely chain E, I, J, which are the | |
| enzyme and two ligands respectively. The script will identify the | |
| longest chain as the protein for simulation. In cases you want to have | |
| fine grained atoms to be selected around the ligands rather than the active | |
| sites, you can use additional parameter `ligand_chain_id`. | |
| > cmd.mixed_cg('1nes', ligand_chain_id='IJ', | |
| active_sites=[57, 102, 195], save=True) | |
| Note that some parameters are omitted for the default values. | |
| Issues: | |
| I have not tried this on dimers or tetramers. Behavior for running this | |
| script on polymers might be odd. | |
| """ | |
| from Bio import Entrez | |
| import os | |
| import sys | |
| import urllib | |
| import re | |
| # Identify yourself with the NCBI server | |
| Entrez.email = "example@abc.com" | |
| class EntrezRetriever: | |
| def __init__(self, db='gene'): | |
| self.db = db | |
| self.search_feature_header = re.compile(' {5}[A-za-z]+').search | |
| self.search_active_site_type = re.compile(r'/site_type="active').search | |
| self.search_active_sites = re.compile(r'order\((.+)\)').search | |
| def search_term(self, term, max_results=10): | |
| handle = Entrez.esearch(self.db, retmax=max_results, term=term) | |
| try: | |
| record = Entrez.read(handle) | |
| except RuntimeError, e: | |
| print "An error occurred during searching database." | |
| print "ERROR: %s" % e | |
| sys.exit(1) | |
| if record['Count'] > 0: | |
| return record['IdList'] | |
| else: | |
| return [] | |
| def retrieve_annotation(self, id_list): | |
| """Annotates Entrez Gene IDs using Bio.Entrez, in particular epost (to | |
| submit the data to NCBI) and esummary to retrieve the information. | |
| Returns a list of dictionaries with the annotations.""" | |
| request = Entrez.epost(self.db, id=",".join([str(id) for id in id_list])) | |
| try: | |
| result = Entrez.read(request) | |
| except RuntimeError, e: | |
| #FIXME: How generate NAs instead of causing an error with invalid IDs? | |
| print "An error occurred while retrieving the annotations." | |
| print "The error returned was %s" % e | |
| sys.exit(-1) | |
| webEnv = result["WebEnv"] | |
| queryKey = result["QueryKey"] | |
| data = Entrez.esummary(db=self.db, webenv=webEnv, query_key = queryKey) | |
| annotations = Entrez.read(data) | |
| return annotations | |
| def retrieve_features(self, id, feature): | |
| raw_metadata = self.strip_tags(urllib.urlopen( | |
| 'http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?val=%(id)s&db=\ | |
| %(db)s&retmode=html' % {'id' : id, 'db': self.db}).read()).splitlines(True) | |
| lines_with_headers = [lineno for lineno in xrange(len(raw_metadata)) | |
| if self.search_feature_header(raw_metadata[lineno])] | |
| lines_with_features = filter( | |
| lambda lineno: raw_metadata[lineno].startswith(' %s' % feature), | |
| lines_with_headers) | |
| features_selected = [] | |
| for site_lineno in lines_with_features: | |
| index = lines_with_headers.index(site_lineno) | |
| if index < len(raw_metadata) - 1: | |
| features_selected.append(raw_metadata[lines_with_headers[index]:\ | |
| lines_with_headers[index+1]]) | |
| elif index == len(raw_metadata): | |
| features_selected(raw_metadata[lines_with_features[index]:]) | |
| return features_selected | |
| def find_active_sites(self, all_sites, offset): | |
| # Only return the first one? | |
| for site in all_sites: | |
| if self.search_active_site_type(site[1]): | |
| return map(lambda acs: | |
| int(acs)+offset, | |
| self.search_active_sites(site[0]).group(1).split(',')) | |
| return None | |
| def strip_tags(self, html): | |
| return re.sub('</?[^<>]+?>', '', html) | |
| def construct_mixed_cg(object_id, around_with, selector_peptide, radius, byres): | |
| cmd.select('%s_fine_grained' % object_id, | |
| '((%(object)s and %(byres)s %(object)s_%(with)s \ | |
| around %(radius)d) and %(chain)s) and not resn HOH+SO4' % { | |
| 'object': object_id, | |
| 'chain': selector_peptide, | |
| 'with': around_with, | |
| 'radius': radius, | |
| 'byres': ['', 'byres'][byres] | |
| }) | |
| cmd.select('%s_coarse_grained' % object_id, | |
| '%(object)s and %(chain)s and not (%(object)s_%(with)s \ | |
| or %(object)s_fine_grained) and name CA' % { | |
| 'object': object_id, | |
| 'with': around_with, | |
| 'chain': selector_peptide, | |
| }) | |
| # Now try to re-visualize the molecule | |
| cmd.hide('everything', object_id) | |
| for color, shape, parts in [('red', 'sticks', around_with), | |
| ('marine', 'sticks', 'fine_grained'), | |
| ('gray', 'spheres', 'coarse_grained')]: | |
| selector = '%(object)s_%(parts)s' % { | |
| 'object': object_id, | |
| 'parts': parts | |
| } | |
| cmd.color(color, selector) | |
| cmd.show(shape, selector) | |
| def save_model(object_id, around_with, path=''): | |
| """Save the mixed cg model to PDB files at given path.""" | |
| for suffix in ['coarse_grained', 'fine_grained']: | |
| pdb = os.path.join(path, | |
| '%(object)s_%(suffix)s.pdb' % { | |
| 'object': object_id, | |
| 'suffix': suffix | |
| }) | |
| if suffix == 'fine_grained': | |
| selector = '%(object)s_fine_grained or %(object)s_%(with)s' % { | |
| 'object': object_id, | |
| 'with': around_with | |
| } | |
| else: | |
| selector = '%s_coarse_grained' % object_id | |
| cmd.save(pdb, selector) | |
| def concatenate_pdbs(pdbs, active_sites, path=''): | |
| pdbs = [os.path.join(path, | |
| [''.join((f, '.pdb')), f][f.endswith('.pdb')]) | |
| for f in pdbs] | |
| object_id = [f[:f.find('_')] for f in pdbs][0] | |
| atom_serial = 1 | |
| prev_serial = 1 | |
| active_site_atoms = [] | |
| active_sites = map(str, active_sites) # for string comparison | |
| atom_in_files = [] | |
| output = [] | |
| for coords_file in pdbs: | |
| if os.path.exists(coords_file): | |
| records = open(coords_file).read().splitlines(False) | |
| for rec in records: | |
| if rec[:6] in ('ATOM ', 'HETATM'): | |
| # Split coordinate record into non-trivial data and spaces | |
| # so we can deal with specific columns. | |
| coord_columns = re.split(r'\s+', rec) | |
| coord_spaces = re.findall(r'\s+', rec) | |
| # We will now reset all atom indices to sequential order. | |
| # To maintain the alignment of the coordinate records, we | |
| # need to compensate the alternations in atom indices by | |
| # adjusting spaces | |
| space_offset = len(coord_columns[1]) - len(str(atom_serial)) | |
| coord_spaces[0] = ' ' * (len(coord_spaces[0]) + space_offset) | |
| # Reset the serial | |
| coord_columns[1] = str(atom_serial) | |
| # We need to record all the (modified) atom indices that | |
| # corresponds to active site residues, which will be | |
| # compared later. | |
| if coord_columns[5] in active_sites: | |
| active_site_atoms.append(atom_serial) | |
| atom_serial += 1 | |
| # Regroup all data back into string | |
| coord_all = [] | |
| for col, space in zip(coord_columns, coord_spaces): | |
| coord_all.extend([col, space]) | |
| output.append(''.join(coord_all)) | |
| atom_in_files.append('%s - %d' % ( | |
| coords_file, atom_serial - prev_serial | |
| )) | |
| prev_serial = atom_serial | |
| fd = open(os.path.join(path, '%s_mixed_cg.pdb' % object_id), 'w+') | |
| fd.write('\n'.join(output)) | |
| fd.close() | |
| # Try to represent the indices for atoms in MATLAB compatible vector | |
| if active_site_atoms: | |
| _atoms = [] | |
| atom_count = len(active_site_atoms) - 1 | |
| reduce(lambda x, y: _atoms.append( | |
| [str(active_site_atoms[x]), '-'][y < atom_count and\ | |
| active_site_atoms[y]-active_site_atoms[x]==1]) or y, | |
| xrange(len(active_site_atoms))) | |
| vec_atoms = re.sub('(-+)(\d+)', | |
| lambda x: "%d:%s," % ( | |
| int(x.group(2))-len(x.group(1)), | |
| x.group(2)), | |
| ''.join(_atoms))[:-1] | |
| else: | |
| vec_atoms = '' | |
| fd = open(os.path.join(path, '%s_model.txt' % object_id), 'w+') | |
| fd.write("Atoms in each file:\n" + '\n'.join(atom_in_files) +\ | |
| '\n\nAtoms in active site residues:\n') | |
| fd.write(vec_atoms) | |
| fd.close() | |
| def mixed_cg(object_id, ligand_chain_id=None, active_sites=None, | |
| around=10, offset=0, byres=False, save=False): | |
| entrezer = EntrezRetriever(db='protein') | |
| chains = sorted([(len(cmd.get_model('%(object)s and chain %(chain)s' % | |
| {'object': object_id, 'chain': chain_id}).atom), chain_id) | |
| for chain_id in cmd.get_chains(object_id)], | |
| lambda x, y: cmp(y[0], x[0])) | |
| selector_peptide = 'chain %s' % chains[0][1] | |
| entrez_protein_id = '%(object)s_%(chain)s' % { | |
| 'object': object_id, 'chain': chains[0][1]} | |
| if not active_sites: | |
| print 'Searching NCBI using', entrez_protein_id | |
| id_list = entrezer.search_term(entrez_protein_id) | |
| if id_list: | |
| # Grab all annotations metadata categorized as functional sites | |
| print 'Found protein entry', id_list[0] | |
| active_sites = entrezer.find_active_sites( | |
| entrezer.retrieve_features(id_list[0], 'Site'), offset) | |
| if active_sites: | |
| print 'Found active sites', \ | |
| ','.join(map(str, active_sites)), 'for', object_id | |
| if not ligand_chain_id and active_sites: | |
| # Active sites found, try build mixed cg model by selecting atoms | |
| # close to proximity of active sites as fine grained atoms. | |
| cmd.select('%s_active_sites' % object_id, | |
| '%(object)s and %(chain)s and resi %(residues)s' % { | |
| 'object': object_id, | |
| 'chain': selector_peptide, | |
| 'residues': '+'.join(map(str, active_sites)) | |
| }) | |
| construct_mixed_cg(object_id, 'active_sites', | |
| selector_peptide, around, byres) | |
| else: | |
| # User specified chain(s) as ligand(s), build mixed cg model based on | |
| # that. | |
| selector_ligands = ' or '.join(map(lambda arg: 'chain %s' % arg, | |
| list(ligand_chain_id))) | |
| ligands = cmd.select('%s_ligands' % object_id, | |
| '%(object)s and %(ligand_chains)s and not resn HOH+SO4' % { | |
| 'object': object_id, | |
| 'ligand_chains': selector_ligands | |
| }) | |
| construct_mixed_cg(object_id, 'ligands', selector_peptide, around, | |
| byres) | |
| if save: | |
| # Save the PDB files to current directory, and perform concatenation | |
| save_model(object_id, | |
| ['ligands', 'active_sites'][ligand_chain_id == None]) | |
| concatenate_pdbs(('%s_coarse_grained.pdb' % object_id, | |
| '%s_fine_grained.pdb' % object_id), | |
| active_sites) | |
| print 'Mixed CG model PDB files saved to your current directory.' | |
| # Insert the function into cmd namespace | |
| # NOTE: We avoid using cmd.extend() functionality here as suggested by PyMOL | |
| # scripting tutorial. Calling cmd.mixed_cg() brings more flexibility in using | |
| # optional parameters and consistency in avoiding weird errors. (As PyMOL | |
| # treats every argument, even those supposed to be integers, as strings) | |
| cmd.mixed_cg = mixed_cg |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment