Created
April 15, 2017 22:53
-
-
Save anonymous/2e32b4eeb5a85bbbc767720116e3edd0 to your computer and use it in GitHub Desktop.
Fast calculations of RDKit descriptors using coarse-grained parallelism.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from multiprocessing import Pool | |
| import pandas as pd | |
| from rdkit import Chem | |
| from rdkit.ML.Descriptors import MoleculeDescriptors | |
| import numpy as np | |
| import os | |
def papply(df, func, n_jobs=None, n_partitions=10):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel.

    Args:
        df: DataFrame to process.
        func: Picklable callable that receives one DataFrame chunk and
            returns a pandas object; the results are combined with
            ``pd.concat``.
        n_jobs: Number of worker processes. ``None`` uses
            ``os.cpu_count()``.
        n_partitions: Number of chunks to create per worker process.

    Returns:
        The concatenation of ``func(chunk)`` over all chunks, in the
        original row order.
    """
    n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)

    # Never create more chunks than rows (empty chunks only add pickling
    # overhead), but always keep one so pd.concat has something to combine.
    n_chunks = max(1, min(n_partitions * n_workers, len(df)))

    # Split positional indices instead of the frame itself: calling
    # np.array_split directly on a DataFrame emits a FutureWarning on
    # recent pandas versions.
    positions = np.array_split(np.arange(len(df)), n_chunks)
    chunks = [df.iloc[idx] for idx in positions]

    # The context manager releases the worker processes even when ``func``
    # raises inside ``map``.
    with Pool(n_workers) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)
def get_2d_desc_list(smi='Nc1ccn2ncc(C(=O)Nc3cnoc3-c3cccc(Cl)c3)c2n1'):
    """Return the names of the RDKit descriptors computable for ``smi``.

    Every ``(name, function)`` pair in ``Descriptors._descList`` is tried on
    the molecule parsed from ``smi``; descriptors whose function raises are
    left out of the result.

    Args:
        smi: SMILES string of the probe molecule.

    Returns:
        List of descriptor names, in ``_descList`` order, whose function ran
        on the probe molecule without raising.
    """
    # Import the submodule explicitly rather than relying on some other
    # import having already attached ``Descriptors`` to ``rdkit.Chem``.
    from rdkit.Chem import Descriptors

    mol = Chem.MolFromSmiles(smi)
    applicable = []
    for name, fun in Descriptors._descList:
        try:
            fun(mol)
        except Exception:
            # Descriptor is not applicable to this molecule: skip it.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            continue
        applicable.append(name)
    return applicable
# Descriptor names used by calc_2d_desc; computed once when the module loads.
RDKIT_2D = get_2d_desc_list()


def calc_2d_desc(df, col='SMILES'):
    """Add one column per descriptor in ``RDKIT_2D`` to ``df``.

    Args:
        df: DataFrame holding SMILES strings; it is modified in place.
        col: Name of the column that contains the SMILES strings.

    Returns:
        The same ``df`` object, with the descriptor columns assigned.
    """
    names = RDKIT_2D
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(names)

    def describe(smiles):
        # One tuple of descriptor values per molecule, ordered like ``names``.
        return calculator.CalcDescriptors(Chem.MolFromSmiles(smiles))

    per_molecule = df[col].apply(describe)
    # Transpose so that iteration yields one array per descriptor.
    per_descriptor = np.array(list(per_molecule.values)).T
    for name, values in zip(names, per_descriptor):
        df[name] = values
    return df
# Ask OpenMP-backed libraries to use a single thread per process; the
# parallelism here is meant to come from the worker processes instead.
os.environ['OMP_NUM_THREADS'] = "1"

# Guard the parallel call: under the "spawn" start method each worker
# re-imports the main module, and an unguarded call would be re-executed
# in every worker.
if __name__ == "__main__":
    # NOTE(review): `df` is not defined in this file; it is expected to be a
    # DataFrame with a 'SMILES' column that was loaded beforehand.
    df = papply(df, calc_2d_desc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment