Skip to content

Instantly share code, notes, and snippets.

Created April 15, 2017 22:53
Show Gist options
  • Select an option

  • Save anonymous/2e32b4eeb5a85bbbc767720116e3edd0 to your computer and use it in GitHub Desktop.

Select an option

Save anonymous/2e32b4eeb5a85bbbc767720116e3edd0 to your computer and use it in GitHub Desktop.
Fast calculations of RDKit descriptors using coarse-grained parallelism.
from multiprocessing import Pool
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
import numpy as np
import os
def papply(df, func, n_jobs=None, n_partitions=10):
    """Apply ``func`` to row-wise chunks of ``df`` in a pool of processes.

    Args:
        df: DataFrame to process.
        func: Callable that takes a DataFrame chunk and returns a DataFrame.
            It is handed to ``Pool.map``, so it must be picklable (e.g. a
            module-level function, not a lambda).
        n_jobs: Number of worker processes, passed straight to
            ``multiprocessing.Pool``; ``None`` lets the pool pick its default.
        n_partitions: Number of chunks to create per worker process.

    Returns:
        The per-chunk results concatenated with ``pd.concat`` in chunk order.
    """
    pool = Pool(n_jobs)
    try:
        # NOTE: ``_processes`` is a private Pool attribute; it is read here to
        # learn how many workers the pool actually started when n_jobs is None.
        n_chunks = n_partitions * pool._processes
        # Never ask for more chunks than rows: the extra chunks would be empty
        # frames that still cost a round trip to a worker.
        n_chunks = min(n_chunks, max(len(df), 1))
        # Split row positions instead of the frame itself; calling
        # np.array_split directly on a DataFrame relies on the deprecated
        # DataFrame.swapaxes in recent pandas releases.
        row_groups = np.array_split(np.arange(len(df)), n_chunks)
        chunks = [df.iloc[rows] for rows in row_groups]
        result = pd.concat(pool.map(func, chunks))
    finally:
        # Release the workers even when func (or the concat) raises.
        pool.close()
        pool.join()
    return result
def get_2d_desc_list(smi='Nc1ccn2ncc(C(=O)Nc3cnoc3-c3cccc(Cl)c3)c2n1'):
    """Return the names of the RDKit descriptors that evaluate on a probe molecule.

    Every ``(name, function)`` pair in ``Descriptors._descList`` is called on
    the molecule parsed from ``smi``; a descriptor is kept only if its
    function returns without raising.

    Args:
        smi: SMILES string of the probe molecule built without 3D coordinates.

    Returns:
        List of descriptor names, in ``_descList`` order, that could be
        computed for the probe molecule.
    """
    # Imported explicitly: ``Chem.Descriptors`` is only reachable as an
    # attribute of ``Chem`` after the submodule has been imported somewhere.
    from rdkit.Chem import Descriptors

    mol = Chem.MolFromSmiles(smi)
    usable = []
    for name, fun in Descriptors._descList:
        try:
            fun(mol)
        except Exception:
            # Descriptor is not applicable to this molecule; skip it.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            continue
        usable.append(name)
    return usable
# Descriptor names resolved once at import time; calc_2d_desc reads this list.
RDKIT_2D = get_2d_desc_list()
def calc_2d_desc(df, col='SMILES'):
    """Add one column per descriptor in ``RDKIT_2D`` to ``df``.

    Args:
        df: DataFrame holding SMILES strings; it is modified in place.
        col: Name of the column that contains the SMILES strings.

    Returns:
        The same ``df`` object with a new column for every name in
        ``RDKIT_2D``. Rows whose SMILES cannot be parsed
        (``Chem.MolFromSmiles`` returns ``None``) get NaN in every
        descriptor column.
    """
    names = RDKIT_2D
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(names)
    missing = (np.nan,) * len(names)

    def describe(smiles):
        # Do not hand a None molecule to the calculator; mark the row missing.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return missing
        return calc.CalcDescriptors(mol)

    rows = [describe(smiles) for smiles in df[col]]
    # The explicit reshape keeps the matrix (n_rows, n_descriptors) even for
    # an empty frame, so the descriptor columns are always created.
    values = np.array(rows, dtype=float).reshape(len(rows), len(names))
    for position, name in enumerate(names):
        df[name] = values[:, position]
    return df
if __name__ == '__main__':
    # Guarded so the pool is only started when this file is run as a script:
    # multiprocessing start methods that re-import the main module would
    # otherwise re-execute this code in every worker process.

    # Presumably limits OpenMP-backed libraries to one thread per worker so
    # the processes do not oversubscribe the cores -- TODO confirm.
    os.environ['OMP_NUM_THREADS'] = "1"

    # NOTE(review): `df` is not defined anywhere in this snippet; load a
    # DataFrame with a 'SMILES' column before this line.
    df = papply(df, calc_2d_desc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment