cristianpjensen/split_protein_ligand.py

## split_protein_ligand.py
#!/usr/bin/env python3
"""
PDB Splitter - Separate Protein and Ligand

This script takes a PDB file containing a protein-ligand complex and splits it into:
1. A clean protein structure (without the ligand)
2. The ligand as a separate PDB file

Both outputs are valid PDB files that can be used for docking, visualization, or analysis.

Usage:
    python split_protein_ligand.py -p complex.pdb -l LIG
"""

import argparse
import sys
from pathlib import Path


def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="Split protein-ligand complex into separate PDB files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python split_protein_ligand.py -p 1hsg.pdb -l MK1
    python split_protein_ligand.py -p complex.pdb -l ATP -o output_prefix
    python split_protein_ligand.py -p structure.pdb -l HEM --keep-waters
        """,
    )

    parser.add_argument(
        "-p", "--protein", required=True, help="Input PDB file containing protein and ligand"
    )

    parser.add_argument(
        "-l",
        "--ligand",
        required=True,
        help="Residue name of the ligand to extract (e.g., MK1, ATP, HEM)",
    )

    parser.add_argument(
        "-o",
        "--output",
        default=None,
        help="Output prefix for files (default: input filename without extension)",
    )

    parser.add_argument(
        "--keep-waters", action="store_true", help="Keep water molecules (HOH) in protein output"
    )

    parser.add_argument(
        "--keep-hetero",
        action="store_true",
        help="Keep other HETATM records (ions, cofactors) in protein output",
    )

    parser.add_argument(
        "--all-chains",
        action="store_true",
        help="Extract ligand from all chains (default: extracts from all chains)",
    )

    return parser.parse_args()


def read_pdb_file(filename):
    """Read PDB file and return lines."""
    try:
        with open(filename, "r") as f:
            return f.readlines()
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        sys.exit(1)
    except Exception as e:
        print(f"Error reading file: {e}")
        sys.exit(1)


def extract_ligand(pdb_lines, ligand_name):
    """
    Extract all lines belonging to the specified ligand.

    Returns
    -------
        tuple: (ligand_lines, ligand_found)
    """
    ligand_lines = []
    ligand_found = False
    ligand_chains = set()

    # First pass: collect ligand lines
    for line in pdb_lines:
        if line.startswith(("HETATM", "ATOM")):
            resname = line[17:20].strip()
            if resname == ligand_name:
                ligand_found = True
                ligand_lines.append(line)
                # Track which chain this ligand is in
                chain = line[21:22]
                ligand_chains.add(chain)
        # Also include CONECT records if they reference ligand atoms
        elif line.startswith("CONECT") and ligand_found:
            ligand_lines.append(line)

    # Add header information
    header_lines = []
    header_lines.append(f"HEADER    LIGAND {ligand_name}\n")
    header_lines.append(f"REMARK   Extracted ligand: {ligand_name}\n")
    if ligand_chains:
        header_lines.append(f"REMARK   Found in chain(s): {', '.join(sorted(ligand_chains))}\n")

    # Ensure proper PDB format
    if ligand_lines and not ligand_lines[-1].startswith("END"):
        ligand_lines.append("END\n")

    return header_lines + ligand_lines, ligand_found, ligand_chains


def extract_protein(pdb_lines, ligand_name, keep_waters=False, keep_hetero=False):
    """
    Extract protein without the specified ligand.

    Returns
    -------
        list: protein lines without ligand
    """
    protein_lines = []

    for line in pdb_lines:
        # Keep ATOM records (standard protein atoms)
        if line.startswith("ATOM"):
            protein_lines.append(line)

        # Handle HETATM records
        elif line.startswith("HETATM"):
            resname = line[17:20].strip()

            # Skip the target ligand
            if resname == ligand_name:
                continue

            # Handle waters
            if resname in ["HOH", "WAT"]:
                if keep_waters:
                    protein_lines.append(line)
                continue

            # Handle other HETATM (ions, cofactors, etc.)
            if keep_hetero:
                protein_lines.append(line)

        # Keep structural records
        elif line.startswith(("CRYST1", "MODEL", "ENDMDL", "TER")):
            protein_lines.append(line)

        # Keep header information
        elif line.startswith(
            (
                "HEADER",
                "TITLE",
                "COMPND",
                "SOURCE",
                "AUTHOR",
                "REVDAT",
                "REMARK",
                "SEQRES",
                "HELIX",
                "SHEET",
            )
        ):
            protein_lines.append(line)

    # Ensure proper ending
    if protein_lines and not protein_lines[-1].startswith("END"):
        protein_lines.append("END\n")

    return protein_lines


def get_ligand_info(ligand_lines):
    """Get information about the extracted ligand."""
    atom_count = 0
    elements = set()

    for line in ligand_lines:
        if line.startswith(("HETATM", "ATOM")):
            atom_count += 1
            # Element is in columns 77-78, or can be inferred from atom name
            if len(line) > 76:
                element = line[76:78].strip()
                if element:
                    elements.add(element)

    return atom_count, elements


def write_pdb_file(lines, filename):
    """Write lines to PDB file."""
    try:
        with open(filename, "w") as f:
            f.writelines(lines)
        return True
    except Exception as e:
        print(f"Error writing {filename}: {e}")
        return False


def main():
    """Main execution function."""
    args = parse_arguments()

    # Determine output prefix
    if args.output:
        output_prefix = args.output
    else:
        # Use input filename without extension
        output_prefix = Path(args.protein).stem

    print("=" * 70)
    print("PDB SPLITTER - Separate Protein and Ligand")
    print("=" * 70)
    print(f"\nInput file: {args.protein}")
    print(f"Target ligand: {args.ligand}")
    print(f"Output prefix: {output_prefix}")

    # Read PDB file
    print("\n[1/3] Reading PDB file...")
    pdb_lines = read_pdb_file(args.protein)
    print(f"✓ Read {len(pdb_lines)} lines")

    # Extract ligand
    print(f"\n[2/3] Extracting ligand '{args.ligand}'...")
    ligand_lines, ligand_found, ligand_chains = extract_ligand(pdb_lines, args.ligand)

    if not ligand_found:
        print(f"\nError: Ligand '{args.ligand}' not found in PDB file.")
        print("\nAvailable residue names:")
        unique_residues = set()
        for line in pdb_lines:
            if line.startswith(("HETATM", "ATOM")):
                resname = line[17:20].strip()
                unique_residues.add(resname)

        # Separate standard amino acids from hetero compounds
        standard_aa = {
            "ALA",
            "ARG",
            "ASN",
            "ASP",
            "CYS",
            "GLN",
            "GLU",
            "GLY",
            "HIS",
            "ILE",
            "LEU",
            "LYS",
            "MET",
            "PHE",
            "PRO",
            "SER",
            "THR",
            "TRP",
            "TYR",
            "VAL",
        }

        hetero_compounds = sorted(unique_residues - standard_aa - {"HOH", "WAT"})
        waters = sorted(unique_residues & {"HOH", "WAT"})

        if hetero_compounds:
            print("\n  Hetero compounds (possible ligands):")
            for res in hetero_compounds:
                print(f"    - {res}")

        if waters:
            print("\n  Waters:")
            for res in waters:
                print(f"    - {res}")

        print("\n  Standard amino acids: (not shown)")
        sys.exit(1)

    atom_count, elements = get_ligand_info(ligand_lines)
    print(f"✓ Found {atom_count} atoms in ligand")
    if ligand_chains:
        print(f"✓ Ligand present in chain(s): {', '.join(sorted(ligand_chains))}")
    if elements:
        print(f"✓ Elements: {', '.join(sorted(elements))}")

    # Extract protein
    print(f"\n[3/3] Extracting protein without '{args.ligand}'...")
    protein_lines = extract_protein(pdb_lines, args.ligand, args.keep_waters, args.keep_hetero)

    protein_atoms = sum(1 for line in protein_lines if line.startswith("ATOM"))
    print(f"✓ Protein has {protein_atoms} atoms")

    if not args.keep_waters:
        print("✓ Removed water molecules")
    if not args.keep_hetero:
        print("✓ Removed other HETATM records")

    # Write output files
    print("\n" + "=" * 70)
    print("Writing output files...")
    print("=" * 70)

    protein_file = f"{output_prefix}_protein.pdb"
    ligand_file = f"{output_prefix}_ligand.pdb"

    if write_pdb_file(protein_lines, protein_file):
        print(f"\n✓ Protein saved to: {protein_file}")
        print(f"  Contains: {protein_atoms} atoms")

    if write_pdb_file(ligand_lines, ligand_file):
        print(f"\n✓ Ligand saved to: {ligand_file}")
        print(f"  Contains: {atom_count} atoms")

    # Summary
    print("\n" + "=" * 70)
    print("✓ SPLITTING COMPLETE!")
    print("=" * 70)
    print("\nOutput files:")
    print(f"  1. {protein_file} - Protein structure without ligand")
    print(f"  2. {ligand_file} - Ligand only")

    print("\nNext steps:")
    print("  • Use protein file for receptor preparation (add H, convert to PDBQT)")
    print("  • Use ligand file as reference or for ligand preparation")
    print("  • Visualize both in PyMOL or other molecular viewer")

    print("\nExample visualization in PyMOL:")
    print(f"  load {protein_file}")
    print(f"  load {ligand_file}")
    print("  show cartoon, *protein")
    print("  show sticks, *ligand")
    print("  color green, *ligand")
    print()


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	PDB Splitter - Separate Protein and Ligand

	This script takes a PDB file containing a protein-ligand complex and splits it into:
	1. A clean protein structure (without the ligand)
	2. The ligand as a separate PDB file

	Both outputs are valid PDB files that can be used for docking, visualization, or analysis.

	Usage:
	python split_protein_ligand.py -p complex.pdb -l LIG
	"""

	import argparse
	import sys
	from pathlib import Path


	def parse_arguments():
	"""Parse command line arguments."""
	parser = argparse.ArgumentParser(
	description="Split protein-ligand complex into separate PDB files",
	formatter_class=argparse.RawDescriptionHelpFormatter,
	epilog="""
	Examples:
	python split_protein_ligand.py -p 1hsg.pdb -l MK1
	python split_protein_ligand.py -p complex.pdb -l ATP -o output_prefix
	python split_protein_ligand.py -p structure.pdb -l HEM --keep-waters
	""",
	)

	parser.add_argument(
	"-p", "--protein", required=True, help="Input PDB file containing protein and ligand"
	)

	parser.add_argument(
	"-l",
	"--ligand",
	required=True,
	help="Residue name of the ligand to extract (e.g., MK1, ATP, HEM)",
	)

	parser.add_argument(
	"-o",
	"--output",
	default=None,
	help="Output prefix for files (default: input filename without extension)",
	)

	parser.add_argument(
	"--keep-waters", action="store_true", help="Keep water molecules (HOH) in protein output"
	)

	parser.add_argument(
	"--keep-hetero",
	action="store_true",
	help="Keep other HETATM records (ions, cofactors) in protein output",
	)

	parser.add_argument(
	"--all-chains",
	action="store_true",
	help="Extract ligand from all chains (default: extracts from all chains)",
	)

	return parser.parse_args()


	def read_pdb_file(filename):
	"""Read PDB file and return lines."""
	try:
	with open(filename, "r") as f:
	return f.readlines()
	except FileNotFoundError:
	print(f"Error: File '{filename}' not found.")
	sys.exit(1)
	except Exception as e:
	print(f"Error reading file: {e}")
	sys.exit(1)


	def extract_ligand(pdb_lines, ligand_name):
	"""
	Extract all lines belonging to the specified ligand.

	Returns
	-------
	tuple: (ligand_lines, ligand_found)
	"""
	ligand_lines = []
	ligand_found = False
	ligand_chains = set()

	# First pass: collect ligand lines
	for line in pdb_lines:
	if line.startswith(("HETATM", "ATOM")):
	resname = line[17:20].strip()
	if resname == ligand_name:
	ligand_found = True
	ligand_lines.append(line)
	# Track which chain this ligand is in
	chain = line[21:22]
	ligand_chains.add(chain)
	# Also include CONECT records if they reference ligand atoms
	elif line.startswith("CONECT") and ligand_found:
	ligand_lines.append(line)

	# Add header information
	header_lines = []
	header_lines.append(f"HEADER LIGAND {ligand_name}\n")
	header_lines.append(f"REMARK Extracted ligand: {ligand_name}\n")
	if ligand_chains:
	header_lines.append(f"REMARK Found in chain(s): {', '.join(sorted(ligand_chains))}\n")

	# Ensure proper PDB format
	if ligand_lines and not ligand_lines[-1].startswith("END"):
	ligand_lines.append("END\n")

	return header_lines + ligand_lines, ligand_found, ligand_chains


	def extract_protein(pdb_lines, ligand_name, keep_waters=False, keep_hetero=False):
	"""
	Extract protein without the specified ligand.

	Returns
	-------
	list: protein lines without ligand
	"""
	protein_lines = []

	for line in pdb_lines:
	# Keep ATOM records (standard protein atoms)
	if line.startswith("ATOM"):
	protein_lines.append(line)

	# Handle HETATM records
	elif line.startswith("HETATM"):
	resname = line[17:20].strip()

	# Skip the target ligand
	if resname == ligand_name:
	continue

	# Handle waters
	if resname in ["HOH", "WAT"]:
	if keep_waters:
	protein_lines.append(line)
	continue

	# Handle other HETATM (ions, cofactors, etc.)
	if keep_hetero:
	protein_lines.append(line)

	# Keep structural records
	elif line.startswith(("CRYST1", "MODEL", "ENDMDL", "TER")):
	protein_lines.append(line)

	# Keep header information
	elif line.startswith(
	(
	"HEADER",
	"TITLE",
	"COMPND",
	"SOURCE",
	"AUTHOR",
	"REVDAT",
	"REMARK",
	"SEQRES",
	"HELIX",
	"SHEET",
	)
	):
	protein_lines.append(line)

	# Ensure proper ending
	if protein_lines and not protein_lines[-1].startswith("END"):
	protein_lines.append("END\n")

	return protein_lines


	def get_ligand_info(ligand_lines):
	"""Get information about the extracted ligand."""
	atom_count = 0
	elements = set()

	for line in ligand_lines:
	if line.startswith(("HETATM", "ATOM")):
	atom_count += 1
	# Element is in columns 77-78, or can be inferred from atom name
	if len(line) > 76:
	element = line[76:78].strip()
	if element:
	elements.add(element)

	return atom_count, elements


	def write_pdb_file(lines, filename):
	"""Write lines to PDB file."""
	try:
	with open(filename, "w") as f:
	f.writelines(lines)
	return True
	except Exception as e:
	print(f"Error writing {filename}: {e}")
	return False


	def main():
	"""Main execution function."""
	args = parse_arguments()

	# Determine output prefix
	if args.output:
	output_prefix = args.output
	else:
	# Use input filename without extension
	output_prefix = Path(args.protein).stem

	print("=" * 70)
	print("PDB SPLITTER - Separate Protein and Ligand")
	print("=" * 70)
	print(f"\nInput file: {args.protein}")
	print(f"Target ligand: {args.ligand}")
	print(f"Output prefix: {output_prefix}")

	# Read PDB file
	print("\n[1/3] Reading PDB file...")
	pdb_lines = read_pdb_file(args.protein)
	print(f"✓ Read {len(pdb_lines)} lines")

	# Extract ligand
	print(f"\n[2/3] Extracting ligand '{args.ligand}'...")
	ligand_lines, ligand_found, ligand_chains = extract_ligand(pdb_lines, args.ligand)

	if not ligand_found:
	print(f"\nError: Ligand '{args.ligand}' not found in PDB file.")
	print("\nAvailable residue names:")
	unique_residues = set()
	for line in pdb_lines:
	if line.startswith(("HETATM", "ATOM")):
	resname = line[17:20].strip()
	unique_residues.add(resname)

	# Separate standard amino acids from hetero compounds
	standard_aa = {
	"ALA",
	"ARG",
	"ASN",
	"ASP",
	"CYS",
	"GLN",
	"GLU",
	"GLY",
	"HIS",
	"ILE",
	"LEU",
	"LYS",
	"MET",
	"PHE",
	"PRO",
	"SER",
	"THR",
	"TRP",
	"TYR",
	"VAL",
	}

	hetero_compounds = sorted(unique_residues - standard_aa - {"HOH", "WAT"})
	waters = sorted(unique_residues & {"HOH", "WAT"})

	if hetero_compounds:
	print("\n Hetero compounds (possible ligands):")
	for res in hetero_compounds:
	print(f" - {res}")

	if waters:
	print("\n Waters:")
	for res in waters:
	print(f" - {res}")

	print("\n Standard amino acids: (not shown)")
	sys.exit(1)

	atom_count, elements = get_ligand_info(ligand_lines)
	print(f"✓ Found {atom_count} atoms in ligand")
	if ligand_chains:
	print(f"✓ Ligand present in chain(s): {', '.join(sorted(ligand_chains))}")
	if elements:
	print(f"✓ Elements: {', '.join(sorted(elements))}")

	# Extract protein
	print(f"\n[3/3] Extracting protein without '{args.ligand}'...")
	protein_lines = extract_protein(pdb_lines, args.ligand, args.keep_waters, args.keep_hetero)

	protein_atoms = sum(1 for line in protein_lines if line.startswith("ATOM"))
	print(f"✓ Protein has {protein_atoms} atoms")

	if not args.keep_waters:
	print("✓ Removed water molecules")
	if not args.keep_hetero:
	print("✓ Removed other HETATM records")

	# Write output files
	print("\n" + "=" * 70)
	print("Writing output files...")
	print("=" * 70)

	protein_file = f"{output_prefix}_protein.pdb"
	ligand_file = f"{output_prefix}_ligand.pdb"

	if write_pdb_file(protein_lines, protein_file):
	print(f"\n✓ Protein saved to: {protein_file}")
	print(f" Contains: {protein_atoms} atoms")

	if write_pdb_file(ligand_lines, ligand_file):
	print(f"\n✓ Ligand saved to: {ligand_file}")
	print(f" Contains: {atom_count} atoms")

	# Summary
	print("\n" + "=" * 70)
	print("✓ SPLITTING COMPLETE!")
	print("=" * 70)
	print("\nOutput files:")
	print(f" 1. {protein_file} - Protein structure without ligand")
	print(f" 2. {ligand_file} - Ligand only")

	print("\nNext steps:")
	print(" • Use protein file for receptor preparation (add H, convert to PDBQT)")
	print(" • Use ligand file as reference or for ligand preparation")
	print(" • Visualize both in PyMOL or other molecular viewer")

	print("\nExample visualization in PyMOL:")
	print(f" load {protein_file}")
	print(f" load {ligand_file}")
	print(" show cartoon, *protein")
	print(" show sticks, *ligand")
	print(" color green, *ligand")
	print()


	if __name__ == "__main__":
	main()
No results found