Skip to content

Instantly share code, notes, and snippets.

@cristianpjensen
Created October 17, 2025 12:05
Show Gist options
  • Select an option

  • Save cristianpjensen/16024f1d576423ec510acdcabb2b74bd to your computer and use it in GitHub Desktop.

Select an option

Save cristianpjensen/16024f1d576423ec510acdcabb2b74bd to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
PDB Splitter - Separate Protein and Ligand
This script takes a PDB file containing a protein-ligand complex and splits it into:
1. A clean protein structure (without the ligand)
2. The ligand as a separate PDB file
Both outputs are valid PDB files that can be used for docking, visualization, or analysis.
Usage:
python split_protein_ligand.py -p complex.pdb -l LIG
"""
import argparse
import sys
from pathlib import Path
def parse_arguments():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="Split protein-ligand complex into separate PDB files",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python split_protein_ligand.py -p 1hsg.pdb -l MK1
python split_protein_ligand.py -p complex.pdb -l ATP -o output_prefix
python split_protein_ligand.py -p structure.pdb -l HEM --keep-waters
""",
)
parser.add_argument(
"-p", "--protein", required=True, help="Input PDB file containing protein and ligand"
)
parser.add_argument(
"-l",
"--ligand",
required=True,
help="Residue name of the ligand to extract (e.g., MK1, ATP, HEM)",
)
parser.add_argument(
"-o",
"--output",
default=None,
help="Output prefix for files (default: input filename without extension)",
)
parser.add_argument(
"--keep-waters", action="store_true", help="Keep water molecules (HOH) in protein output"
)
parser.add_argument(
"--keep-hetero",
action="store_true",
help="Keep other HETATM records (ions, cofactors) in protein output",
)
parser.add_argument(
"--all-chains",
action="store_true",
help="Extract ligand from all chains (default: extracts from all chains)",
)
return parser.parse_args()
def read_pdb_file(filename):
"""Read PDB file and return lines."""
try:
with open(filename, "r") as f:
return f.readlines()
except FileNotFoundError:
print(f"Error: File '{filename}' not found.")
sys.exit(1)
except Exception as e:
print(f"Error reading file: {e}")
sys.exit(1)
def extract_ligand(pdb_lines, ligand_name):
"""
Extract all lines belonging to the specified ligand.
Returns
-------
tuple: (ligand_lines, ligand_found)
"""
ligand_lines = []
ligand_found = False
ligand_chains = set()
# First pass: collect ligand lines
for line in pdb_lines:
if line.startswith(("HETATM", "ATOM")):
resname = line[17:20].strip()
if resname == ligand_name:
ligand_found = True
ligand_lines.append(line)
# Track which chain this ligand is in
chain = line[21:22]
ligand_chains.add(chain)
# Also include CONECT records if they reference ligand atoms
elif line.startswith("CONECT") and ligand_found:
ligand_lines.append(line)
# Add header information
header_lines = []
header_lines.append(f"HEADER LIGAND {ligand_name}\n")
header_lines.append(f"REMARK Extracted ligand: {ligand_name}\n")
if ligand_chains:
header_lines.append(f"REMARK Found in chain(s): {', '.join(sorted(ligand_chains))}\n")
# Ensure proper PDB format
if ligand_lines and not ligand_lines[-1].startswith("END"):
ligand_lines.append("END\n")
return header_lines + ligand_lines, ligand_found, ligand_chains
def extract_protein(pdb_lines, ligand_name, keep_waters=False, keep_hetero=False):
"""
Extract protein without the specified ligand.
Returns
-------
list: protein lines without ligand
"""
protein_lines = []
for line in pdb_lines:
# Keep ATOM records (standard protein atoms)
if line.startswith("ATOM"):
protein_lines.append(line)
# Handle HETATM records
elif line.startswith("HETATM"):
resname = line[17:20].strip()
# Skip the target ligand
if resname == ligand_name:
continue
# Handle waters
if resname in ["HOH", "WAT"]:
if keep_waters:
protein_lines.append(line)
continue
# Handle other HETATM (ions, cofactors, etc.)
if keep_hetero:
protein_lines.append(line)
# Keep structural records
elif line.startswith(("CRYST1", "MODEL", "ENDMDL", "TER")):
protein_lines.append(line)
# Keep header information
elif line.startswith(
(
"HEADER",
"TITLE",
"COMPND",
"SOURCE",
"AUTHOR",
"REVDAT",
"REMARK",
"SEQRES",
"HELIX",
"SHEET",
)
):
protein_lines.append(line)
# Ensure proper ending
if protein_lines and not protein_lines[-1].startswith("END"):
protein_lines.append("END\n")
return protein_lines
def get_ligand_info(ligand_lines):
"""Get information about the extracted ligand."""
atom_count = 0
elements = set()
for line in ligand_lines:
if line.startswith(("HETATM", "ATOM")):
atom_count += 1
# Element is in columns 77-78, or can be inferred from atom name
if len(line) > 76:
element = line[76:78].strip()
if element:
elements.add(element)
return atom_count, elements
def write_pdb_file(lines, filename):
"""Write lines to PDB file."""
try:
with open(filename, "w") as f:
f.writelines(lines)
return True
except Exception as e:
print(f"Error writing {filename}: {e}")
return False
def main():
"""Main execution function."""
args = parse_arguments()
# Determine output prefix
if args.output:
output_prefix = args.output
else:
# Use input filename without extension
output_prefix = Path(args.protein).stem
print("=" * 70)
print("PDB SPLITTER - Separate Protein and Ligand")
print("=" * 70)
print(f"\nInput file: {args.protein}")
print(f"Target ligand: {args.ligand}")
print(f"Output prefix: {output_prefix}")
# Read PDB file
print("\n[1/3] Reading PDB file...")
pdb_lines = read_pdb_file(args.protein)
print(f"✓ Read {len(pdb_lines)} lines")
# Extract ligand
print(f"\n[2/3] Extracting ligand '{args.ligand}'...")
ligand_lines, ligand_found, ligand_chains = extract_ligand(pdb_lines, args.ligand)
if not ligand_found:
print(f"\nError: Ligand '{args.ligand}' not found in PDB file.")
print("\nAvailable residue names:")
unique_residues = set()
for line in pdb_lines:
if line.startswith(("HETATM", "ATOM")):
resname = line[17:20].strip()
unique_residues.add(resname)
# Separate standard amino acids from hetero compounds
standard_aa = {
"ALA",
"ARG",
"ASN",
"ASP",
"CYS",
"GLN",
"GLU",
"GLY",
"HIS",
"ILE",
"LEU",
"LYS",
"MET",
"PHE",
"PRO",
"SER",
"THR",
"TRP",
"TYR",
"VAL",
}
hetero_compounds = sorted(unique_residues - standard_aa - {"HOH", "WAT"})
waters = sorted(unique_residues & {"HOH", "WAT"})
if hetero_compounds:
print("\n Hetero compounds (possible ligands):")
for res in hetero_compounds:
print(f" - {res}")
if waters:
print("\n Waters:")
for res in waters:
print(f" - {res}")
print("\n Standard amino acids: (not shown)")
sys.exit(1)
atom_count, elements = get_ligand_info(ligand_lines)
print(f"✓ Found {atom_count} atoms in ligand")
if ligand_chains:
print(f"✓ Ligand present in chain(s): {', '.join(sorted(ligand_chains))}")
if elements:
print(f"✓ Elements: {', '.join(sorted(elements))}")
# Extract protein
print(f"\n[3/3] Extracting protein without '{args.ligand}'...")
protein_lines = extract_protein(pdb_lines, args.ligand, args.keep_waters, args.keep_hetero)
protein_atoms = sum(1 for line in protein_lines if line.startswith("ATOM"))
print(f"✓ Protein has {protein_atoms} atoms")
if not args.keep_waters:
print("✓ Removed water molecules")
if not args.keep_hetero:
print("✓ Removed other HETATM records")
# Write output files
print("\n" + "=" * 70)
print("Writing output files...")
print("=" * 70)
protein_file = f"{output_prefix}_protein.pdb"
ligand_file = f"{output_prefix}_ligand.pdb"
if write_pdb_file(protein_lines, protein_file):
print(f"\n✓ Protein saved to: {protein_file}")
print(f" Contains: {protein_atoms} atoms")
if write_pdb_file(ligand_lines, ligand_file):
print(f"\n✓ Ligand saved to: {ligand_file}")
print(f" Contains: {atom_count} atoms")
# Summary
print("\n" + "=" * 70)
print("✓ SPLITTING COMPLETE!")
print("=" * 70)
print("\nOutput files:")
print(f" 1. {protein_file} - Protein structure without ligand")
print(f" 2. {ligand_file} - Ligand only")
print("\nNext steps:")
print(" • Use protein file for receptor preparation (add H, convert to PDBQT)")
print(" • Use ligand file as reference or for ligand preparation")
print(" • Visualize both in PyMOL or other molecular viewer")
print("\nExample visualization in PyMOL:")
print(f" load {protein_file}")
print(f" load {ligand_file}")
print(" show cartoon, *protein")
print(" show sticks, *ligand")
print(" color green, *ligand")
print()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment