Created
October 17, 2025 12:05
-
-
Save cristianpjensen/16024f1d576423ec510acdcabb2b74bd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| PDB Splitter - Separate Protein and Ligand | |
| This script takes a PDB file containing a protein-ligand complex and splits it into: | |
| 1. A clean protein structure (without the ligand) | |
| 2. The ligand as a separate PDB file | |
| Both outputs are valid PDB files that can be used for docking, visualization, or analysis. | |
| Usage: | |
| python split_protein_ligand.py -p complex.pdb -l LIG | |
| """ | |
| import argparse | |
| import sys | |
| from pathlib import Path | |
| def parse_arguments(): | |
| """Parse command line arguments.""" | |
| parser = argparse.ArgumentParser( | |
| description="Split protein-ligand complex into separate PDB files", | |
| formatter_class=argparse.RawDescriptionHelpFormatter, | |
| epilog=""" | |
| Examples: | |
| python split_protein_ligand.py -p 1hsg.pdb -l MK1 | |
| python split_protein_ligand.py -p complex.pdb -l ATP -o output_prefix | |
| python split_protein_ligand.py -p structure.pdb -l HEM --keep-waters | |
| """, | |
| ) | |
| parser.add_argument( | |
| "-p", "--protein", required=True, help="Input PDB file containing protein and ligand" | |
| ) | |
| parser.add_argument( | |
| "-l", | |
| "--ligand", | |
| required=True, | |
| help="Residue name of the ligand to extract (e.g., MK1, ATP, HEM)", | |
| ) | |
| parser.add_argument( | |
| "-o", | |
| "--output", | |
| default=None, | |
| help="Output prefix for files (default: input filename without extension)", | |
| ) | |
| parser.add_argument( | |
| "--keep-waters", action="store_true", help="Keep water molecules (HOH) in protein output" | |
| ) | |
| parser.add_argument( | |
| "--keep-hetero", | |
| action="store_true", | |
| help="Keep other HETATM records (ions, cofactors) in protein output", | |
| ) | |
| parser.add_argument( | |
| "--all-chains", | |
| action="store_true", | |
| help="Extract ligand from all chains (default: extracts from all chains)", | |
| ) | |
| return parser.parse_args() | |
| def read_pdb_file(filename): | |
| """Read PDB file and return lines.""" | |
| try: | |
| with open(filename, "r") as f: | |
| return f.readlines() | |
| except FileNotFoundError: | |
| print(f"Error: File '{filename}' not found.") | |
| sys.exit(1) | |
| except Exception as e: | |
| print(f"Error reading file: {e}") | |
| sys.exit(1) | |
| def extract_ligand(pdb_lines, ligand_name): | |
| """ | |
| Extract all lines belonging to the specified ligand. | |
| Returns | |
| ------- | |
| tuple: (ligand_lines, ligand_found) | |
| """ | |
| ligand_lines = [] | |
| ligand_found = False | |
| ligand_chains = set() | |
| # First pass: collect ligand lines | |
| for line in pdb_lines: | |
| if line.startswith(("HETATM", "ATOM")): | |
| resname = line[17:20].strip() | |
| if resname == ligand_name: | |
| ligand_found = True | |
| ligand_lines.append(line) | |
| # Track which chain this ligand is in | |
| chain = line[21:22] | |
| ligand_chains.add(chain) | |
| # Also include CONECT records if they reference ligand atoms | |
| elif line.startswith("CONECT") and ligand_found: | |
| ligand_lines.append(line) | |
| # Add header information | |
| header_lines = [] | |
| header_lines.append(f"HEADER LIGAND {ligand_name}\n") | |
| header_lines.append(f"REMARK Extracted ligand: {ligand_name}\n") | |
| if ligand_chains: | |
| header_lines.append(f"REMARK Found in chain(s): {', '.join(sorted(ligand_chains))}\n") | |
| # Ensure proper PDB format | |
| if ligand_lines and not ligand_lines[-1].startswith("END"): | |
| ligand_lines.append("END\n") | |
| return header_lines + ligand_lines, ligand_found, ligand_chains | |
| def extract_protein(pdb_lines, ligand_name, keep_waters=False, keep_hetero=False): | |
| """ | |
| Extract protein without the specified ligand. | |
| Returns | |
| ------- | |
| list: protein lines without ligand | |
| """ | |
| protein_lines = [] | |
| for line in pdb_lines: | |
| # Keep ATOM records (standard protein atoms) | |
| if line.startswith("ATOM"): | |
| protein_lines.append(line) | |
| # Handle HETATM records | |
| elif line.startswith("HETATM"): | |
| resname = line[17:20].strip() | |
| # Skip the target ligand | |
| if resname == ligand_name: | |
| continue | |
| # Handle waters | |
| if resname in ["HOH", "WAT"]: | |
| if keep_waters: | |
| protein_lines.append(line) | |
| continue | |
| # Handle other HETATM (ions, cofactors, etc.) | |
| if keep_hetero: | |
| protein_lines.append(line) | |
| # Keep structural records | |
| elif line.startswith(("CRYST1", "MODEL", "ENDMDL", "TER")): | |
| protein_lines.append(line) | |
| # Keep header information | |
| elif line.startswith( | |
| ( | |
| "HEADER", | |
| "TITLE", | |
| "COMPND", | |
| "SOURCE", | |
| "AUTHOR", | |
| "REVDAT", | |
| "REMARK", | |
| "SEQRES", | |
| "HELIX", | |
| "SHEET", | |
| ) | |
| ): | |
| protein_lines.append(line) | |
| # Ensure proper ending | |
| if protein_lines and not protein_lines[-1].startswith("END"): | |
| protein_lines.append("END\n") | |
| return protein_lines | |
| def get_ligand_info(ligand_lines): | |
| """Get information about the extracted ligand.""" | |
| atom_count = 0 | |
| elements = set() | |
| for line in ligand_lines: | |
| if line.startswith(("HETATM", "ATOM")): | |
| atom_count += 1 | |
| # Element is in columns 77-78, or can be inferred from atom name | |
| if len(line) > 76: | |
| element = line[76:78].strip() | |
| if element: | |
| elements.add(element) | |
| return atom_count, elements | |
| def write_pdb_file(lines, filename): | |
| """Write lines to PDB file.""" | |
| try: | |
| with open(filename, "w") as f: | |
| f.writelines(lines) | |
| return True | |
| except Exception as e: | |
| print(f"Error writing {filename}: {e}") | |
| return False | |
| def main(): | |
| """Main execution function.""" | |
| args = parse_arguments() | |
| # Determine output prefix | |
| if args.output: | |
| output_prefix = args.output | |
| else: | |
| # Use input filename without extension | |
| output_prefix = Path(args.protein).stem | |
| print("=" * 70) | |
| print("PDB SPLITTER - Separate Protein and Ligand") | |
| print("=" * 70) | |
| print(f"\nInput file: {args.protein}") | |
| print(f"Target ligand: {args.ligand}") | |
| print(f"Output prefix: {output_prefix}") | |
| # Read PDB file | |
| print("\n[1/3] Reading PDB file...") | |
| pdb_lines = read_pdb_file(args.protein) | |
| print(f"✓ Read {len(pdb_lines)} lines") | |
| # Extract ligand | |
| print(f"\n[2/3] Extracting ligand '{args.ligand}'...") | |
| ligand_lines, ligand_found, ligand_chains = extract_ligand(pdb_lines, args.ligand) | |
| if not ligand_found: | |
| print(f"\nError: Ligand '{args.ligand}' not found in PDB file.") | |
| print("\nAvailable residue names:") | |
| unique_residues = set() | |
| for line in pdb_lines: | |
| if line.startswith(("HETATM", "ATOM")): | |
| resname = line[17:20].strip() | |
| unique_residues.add(resname) | |
| # Separate standard amino acids from hetero compounds | |
| standard_aa = { | |
| "ALA", | |
| "ARG", | |
| "ASN", | |
| "ASP", | |
| "CYS", | |
| "GLN", | |
| "GLU", | |
| "GLY", | |
| "HIS", | |
| "ILE", | |
| "LEU", | |
| "LYS", | |
| "MET", | |
| "PHE", | |
| "PRO", | |
| "SER", | |
| "THR", | |
| "TRP", | |
| "TYR", | |
| "VAL", | |
| } | |
| hetero_compounds = sorted(unique_residues - standard_aa - {"HOH", "WAT"}) | |
| waters = sorted(unique_residues & {"HOH", "WAT"}) | |
| if hetero_compounds: | |
| print("\n Hetero compounds (possible ligands):") | |
| for res in hetero_compounds: | |
| print(f" - {res}") | |
| if waters: | |
| print("\n Waters:") | |
| for res in waters: | |
| print(f" - {res}") | |
| print("\n Standard amino acids: (not shown)") | |
| sys.exit(1) | |
| atom_count, elements = get_ligand_info(ligand_lines) | |
| print(f"✓ Found {atom_count} atoms in ligand") | |
| if ligand_chains: | |
| print(f"✓ Ligand present in chain(s): {', '.join(sorted(ligand_chains))}") | |
| if elements: | |
| print(f"✓ Elements: {', '.join(sorted(elements))}") | |
| # Extract protein | |
| print(f"\n[3/3] Extracting protein without '{args.ligand}'...") | |
| protein_lines = extract_protein(pdb_lines, args.ligand, args.keep_waters, args.keep_hetero) | |
| protein_atoms = sum(1 for line in protein_lines if line.startswith("ATOM")) | |
| print(f"✓ Protein has {protein_atoms} atoms") | |
| if not args.keep_waters: | |
| print("✓ Removed water molecules") | |
| if not args.keep_hetero: | |
| print("✓ Removed other HETATM records") | |
| # Write output files | |
| print("\n" + "=" * 70) | |
| print("Writing output files...") | |
| print("=" * 70) | |
| protein_file = f"{output_prefix}_protein.pdb" | |
| ligand_file = f"{output_prefix}_ligand.pdb" | |
| if write_pdb_file(protein_lines, protein_file): | |
| print(f"\n✓ Protein saved to: {protein_file}") | |
| print(f" Contains: {protein_atoms} atoms") | |
| if write_pdb_file(ligand_lines, ligand_file): | |
| print(f"\n✓ Ligand saved to: {ligand_file}") | |
| print(f" Contains: {atom_count} atoms") | |
| # Summary | |
| print("\n" + "=" * 70) | |
| print("✓ SPLITTING COMPLETE!") | |
| print("=" * 70) | |
| print("\nOutput files:") | |
| print(f" 1. {protein_file} - Protein structure without ligand") | |
| print(f" 2. {ligand_file} - Ligand only") | |
| print("\nNext steps:") | |
| print(" • Use protein file for receptor preparation (add H, convert to PDBQT)") | |
| print(" • Use ligand file as reference or for ligand preparation") | |
| print(" • Visualize both in PyMOL or other molecular viewer") | |
| print("\nExample visualization in PyMOL:") | |
| print(f" load {protein_file}") | |
| print(f" load {ligand_file}") | |
| print(" show cartoon, *protein") | |
| print(" show sticks, *ligand") | |
| print(" color green, *ligand") | |
| print() | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment