Source code for drugforge.spectrum.calculate_rmsd

import logging
from pathlib import Path
from typing import Union

import MDAnalysis as mda
import numpy as np
import pymol2
from Bio import pairwise2
from drugforge.data.backend.openeye import load_openeye_pdb, save_openeye_pdb
from drugforge.modeling.modeling import superpose_molecule
from drugforge.spectrum.blast import pdb_to_seq


[docs] def rmsd_alignment( target_pdb: str, ref_pdb: str, final_pdb: str, target_chain="A", ref_chain="A", ) -> tuple[float, Path]: """Calculate RMSD of a molecule against a reference Parameters ---------- target_pdb : str Path to PDB of protein to align. ref_pdb : str Path to PDB to align target against. final_pdb : str Path to save PDB of aligned target. target_chain : str, optional The chain of target which will be used for alignment, by default "A" ref_chain : str, optional The chain of reference which will be used for alignment, by default "A" Returns ------- float, str RMSD after alignment, Path to saved PDB """ protein = load_openeye_pdb(target_pdb) ref_protein = load_openeye_pdb(ref_pdb) aligned_protein, rmsd = superpose_molecule( ref_protein, protein, ref_chain=ref_chain, mobile_chain=target_chain ) pdb_aligned = save_openeye_pdb(aligned_protein, final_pdb) return rmsd, pdb_aligned
[docs] def select_best_colabfold( results_dir: str, seq_name: str, pdb_ref: str, chain="A", final_pdb="aligned_protein.pdb", fold_model="alphafold2_ptm", ) -> tuple[float, str]: """Select the best seed output (repetition) from a ColabFold run based on its RMSD wrt the reference. Parameters ---------- results_dir : str The directory containing the ColabFold results. seq_name : str The name we gave to the sequence in the csv file. pdb_ref : str The path to the PDB of te reference protein. chain : str, optional Chain of both reference and generated PDB that will be used, by default "A" final_pdb : str, optional Path to the PDB where aligned structure will be saved, by default "aligned_protein.pdb" fold_model : str, optional The model used for ColabFold, by default "alphafold2_ptm" Returns ------- Tuple[float, str] RMSD after alignment, Path to saved PDB Raises ------ FileNotFoundError The directory with ColabFold results does not exist """ rmsds = [] seeds = [] file_seed = [] results_dir = Path(results_dir) if not results_dir.exists(): raise FileNotFoundError( f"A folder with ColbFold results {results_dir} does not exist" ) for file_path in results_dir.glob(f"{seq_name}*_{fold_model}_model_1_seed_*.pdb"): pdb_to_compare = file_path seed = str(pdb_to_compare).split("_")[-1].split(".")[0] rmsd, pdb = rmsd_alignment(pdb_to_compare, pdb_ref, final_pdb, chain, chain) rmsds.append(rmsd) seeds.append(seed) file_seed.append(file_path) logging.info(f"RMSD for seed {seed} is {rmsd} A") if len(rmsds) == 0: logging.warning( f"No ColabFold entry for {seq_name} and model {fold_model} found." ) return 0, "" min_rmsd = np.argmin(rmsds) min_rmsd_file = file_seed[min_rmsd] logging.info( f"{seq_name} seed with least RMSD is {seeds[min_rmsd]} with RMSD {rmsds[min_rmsd]} A" ) min_rmsd, final_pdb = rmsd_alignment( min_rmsd_file, pdb_ref, final_pdb, chain, chain ) return min_rmsd, str(final_pdb)
[docs] def save_alignment_pymol( pdbs: list, labels: list, reference: str, session_save: str, align_chain=str, color_by_rmsd=False, ) -> None: """Imports the provided PDBs into a Pymol session and saves Parameters ---------- pdbs : list List with paths to pdb file to include. labels : list List with labels that will be used in protein objects. reference : str Path to reference PDB. session_save : str File name for the saved PyMOL session. align_chain : str Chain of ref to align target with. color_by_rmsd : bool, optional Option to color aligned targets by RMSD with respect to reference. """ def hide_chain(p, chain, obj): """Hide the other chain from visualization in obj""" dimer_chains = {"A", "B"} hide_chain = (dimer_chains - {chain}).pop() p.cmd.select("chainb", f"{obj} and chain {hide_chain.upper()}") p.cmd.remove("chainb") p.cmd.delete("chainb") p = pymol2.PyMOL() p.start() p.cmd.load(reference, object="ref_protein") p.cmd.color("gray", "ref_protein") # Optionaly remove other chain from the reference protein if align_chain == "both": align_sel = "" else: align_sel = f" and chain {align_chain}" hide_chain(p, align_chain, "ref_protein") p.cmd.select("chaina", f"ref_protein{align_sel}") p.cmd.color("gray", "ref_protein") for i, pdb in enumerate(pdbs): if len(str(pdb)) > 0: # In case the entry is empty (when no CF output was found) pname = labels[i] p.cmd.load(pdb, object=pname) # PDBs should be aligned but in case they are not p.cmd.select("chainp", pname + align_sel) # It's better to align wrt a single chain than the whole protein (at least one binding site to compare) p.cmd.align(f"{pname} and chain A", "ref_protein and chain A") if color_by_rmsd: colorbyrmsd(p, "chainp", "chaina", minimum=0, maximum=2) p.cmd.color("red", "ref_protein") if len(align_chain) == 1: hide_chain(p, align_chain, pname) p.cmd.delete("chainp") p.cmd.delete("chaina") # set visualization p.cmd.set("bg_rgb", "white") p.cmd.bg_color("white") p.cmd.hide("everything") p.cmd.show("cartoon") p.cmd.set("transparency", 0.8) p.cmd.set("transparency", 0.3, "ref_protein") # Color ligand and binding site p.cmd.select("ligand", "resn UNK or resn LIG") p.cmd.select( "binding_site", "name CA within 5 of resn UNK or name CA within 5 resn LIG" ) p.cmd.show("sticks", "ligand") p.cmd.color("red", "ligand") p.cmd.save(session_save) return
[docs] def convert_chain_id(chain: str) -> str: """Convert a chain identifier between letter and number representations. The ASCII values for lowercase letters start from a=97, so we subtract/add 96 to convert. Parameters ---------- chain : str 1 or 3-letter chain identifier. Returns ------- str Converted chain identifier. """ if chain.isalpha(): return ord(chain.lower()) - 96 elif chain.isdigit(): return chr(int(chain) + 96) else: logging.warning(f"Chain identifier {chain} is not valid.") return chain
[docs] def colorbyrmsd( p: pymol2.PyMOL, target_sel: str, ref_sel: str, quiet=True, minimum=None, maximum=None, ): """Color aligned proteins by RMSD with respect to the target. Based on script by original authors Shivender Shandilya and Jason Vertrees, rewrite by Thomas Holder. License: BSD-2-Clause. http://pymolwiki.org/index.php/ColorByRMSD Parameters ---------- p : pymol2.PyMOL Pymol session target_sel : str Selection of aligned target ref_sel : str Selection of reference protein quiet : bool, optional Not print RMSD info, by default True minimum : Union[int,float], optional Set a fixed min RMSD for coloring, by default None maximum : Union[int,float], optional Set a fixed max RMSD for coloring, by default None """ from chempy import cpv selboth, aln = "both", "aln" p.cmd.align(target_sel, ref_sel, cycles=0, transform=0, object=aln) p.cmd.select(selboth, f"{target_sel} or {ref_sel}") idx2coords = {} p.cmd.iterate_state( -1, selboth, "idx2coords[model,index] = (x,y,z)", space=locals() ) if p.cmd.count_atoms("?" + aln, 1, 1) == 0: p.cmd.refresh() b_dict = {} for col in p.cmd.get_raw_alignment(aln): assert len(col) == 2 b = cpv.distance(idx2coords[col[0]], idx2coords[col[1]]) for idx in col: b_dict[idx] = b p.cmd.alter(selboth, "b = b_dict.get((model, index), -1)", space=locals()) p.cmd.orient(selboth) p.cmd.show_as("cartoon", "byobj " + selboth) p.cmd.color("gray", selboth) p.cmd.spectrum("b", "red_blue", selboth + " and b > -0.5", minimum, maximum) # Make colorbar if minimum is not None and maximum is not None: p.cmd.ramp_new("colorbar", "none", [minimum, maximum], ["red", "blue"]) if not quiet: logging.info("ColorByRMSD: Minimum Distance: %.2f" % (min(b_dict.values()))) logging.info("ColorByRMSD: Maximum Distance: %.2f" % (max(b_dict.values()))) logging.info( "ColorByRMSD: Average Distance: %.2f" % (sum(b_dict.values()) / len(b_dict)) ) p.cmd.delete(aln) p.cmd.delete(selboth) return
[docs] def get_residue_mapping(seq_ref: str, seq_mob: str) -> tuple[int, int]: """Aligns two sequences and returns the correct start and end residue indices, ignoring gaps Parameters ---------- seq_ref : str string of reference sequence seq_mob : str string of mobile sequence Returns ------- tuple[int, int] start and end residue indices (1-based indexing) """ alignments = pairwise2.align.globalxx(seq_ref, seq_mob) aligned_ref, aligned_mob = alignments[0][:2] # Extract first alignment ref_idx, mob_idx = [], [] r, m = 0, 0 # Residue counters (no gaps) for a, b in zip(aligned_ref, aligned_mob): ref_idx.append(r if a != "-" else None) mob_idx.append(m if b != "-" else None) r += a != "-" m += b != "-" # Find first and last matched residues start_idx = next( i for i in range(len(ref_idx)) if ref_idx[i] is not None and mob_idx[i] is not None ) end_idx = next( i for i in range(len(ref_idx) - 1, -1, -1) if ref_idx[i] is not None and mob_idx[i] is not None ) return start_idx + 1, end_idx + 1 # 1-based indexing
[docs] def find_bsite_resids( pdb: str, pdb_ref: str, aligned_temp: str, ligres: str = "UNK", chain_m: str = "A", chain_r: str = "A", bsite_dist: float = 4.5, res_threshold: int = 5, ): """Find binding site residues in a protein-ligand complex based on ligand proximity. Parameters ---------- pdb : str Path to PDB of protein to align. pdb_ref : str Path to reference PDB to align target against. aligned_temp : str Path to save aligned PDB. ligres : str, optional Residue name of ligand, by default "UNK" chain_m : str, optional Chain ID of mobile, by default "A" chain_r : str, optional Chain ID of ref, by default "A" bsite_dist : float, optional Distance to ligand in angstroms defining binding site, by default 4.5 res_threshold : int, optional Threshold of residue IDs that can be assigned to binding site. Avoids asigning terminal atoms. By default 5 Returns ------- tuple[np.ndarray, np.ndarray] Arrays of binding site residue IDs in mobile and reference structures. Raises ------ ValueError No binding site residues have an idx above res_threshold """ from scipy.spatial.distance import cdist rmsd, pdb_aln = rmsd_alignment(pdb, pdb_ref, aligned_temp, chain_m, chain_r) u = mda.Universe(pdb_aln) u_ref = mda.Universe(pdb_ref) # Initial atom selections bs_atoms = u_ref.select_atoms( f"protein and chainid {chain_r} and around {bsite_dist} resname {ligres}" ) lig_atoms = u_ref.select_atoms(f"chainid {chain_r} and resname {ligres}") ca_mob = u.select_atoms(f"protein and chainid {chain_m} and name CA").atoms # Handle incorrect chain selections if len(ca_mob) == 0: logging.warning("The mobile chain is incorrect, attempting to fix") chain_m = convert_chain_id(chain_m) ca_mob = u.select_atoms(f"protein and chainid {chain_m} and name CA").atoms if len(bs_atoms) == 0: logging.warning("The reference chain is incorrect, attempting to fix") chain_r = convert_chain_id(chain_r) bs_atoms = u_ref.select_atoms( f"protein and chainid {chain_r} and around {bsite_dist} resname {ligres}" ) lig_atoms = u_ref.select_atoms(f"chainid {chain_r} and resname {ligres}") ref_pos = lig_atoms.positions bs_ref = np.unique(bs_atoms.resids) bs_ref = bs_ref[ bs_ref >= res_threshold ] # In case terminal residue is categorized as binding site n_res = len(bs_ref) if len(bs_ref) == 0: raise ValueError(f"No binding site residues have an idx above {res_threshold}") distances = cdist(ca_mob.positions, ref_pos, metric="euclidean") sorted_flat = np.argsort(distances.ravel()) rows, _ = np.unravel_index(sorted_flat, distances.shape) res_seen = set() bs_mob = [ ca_mob.resids[r] for r in rows if (ca_mob.resids[r] not in res_seen and not res_seen.add(ca_mob.resids[r])) ] return np.sort(bs_mob[:n_res]), bs_ref
[docs] def get_binding_site_rmsd( file_mob: Union[Path, str], file_ref: Union[Path, str], bsite_dist: float = 4.5, rmsd_mode: str = "CA", chain_mob: str = "A", chain_ref: str = "A", ligres: str = "LIG", lig_ref_pdb: str = None, chain_ref2: str = "A", aligned_temp: str = None, ) -> float: """Calculate RMSD for the Binding Site residues between file_mob and file_ref The binding site is defined as all residues protein within bsite_dist Angs of the ligand in the reference protein. If the ligand is missing in the reference, an auxiliary reference (`lig_ref_pdb`) may be provided to define the binding site. Parameters ---------- file_mob : Union[Path, str] Path to target file file_ref : Union[Path, str] Path to reference file bsite_dist : float, optional Cutoff distance to ligand in Angs, by default 4.5 rmsd_mode : str, optional Type of RMSD to compute: 'CA' for alpha carbons or 'heavy' for all heavy atoms, by default "CA". chain_mob : str, optional Chain ID of target to compare, by default "1" chain_ref : str, optional Chain ID of reference to compare, by default "1" ligres : str, optional Resname of ligand residue, by default "UNK" lig_ref_pdb : Path, optional Optional Reference PDB to define binding site, in case ref doesn't have a ligand, by default None chain_ref2 : str, optional Optional reference Chain ID, used when the main reference doesn't have a ligand, by default "A" aligned_temp : Path, optional Optional folder to store alignment PDB when calculating bsite with additional PDB ref, by default None Returns ------- float RMSD of binding site, or -1 if RMSD could not be calculated. Raises ------ ValueError No ligand found in reference, and not second reference provided. """ file_mob = str(file_mob) file_ref = str(file_ref) u = mda.Universe(file_mob).select_atoms( f"protein and segid {chain_mob} and not resname ACE and not resname NME" ) u_ref = mda.Universe(file_ref).select_atoms( f"protein and segid {chain_ref} and not resname ACE and not resname NME" ) u_ref_l = mda.Universe(file_ref).select_atoms( f"(protein and segid {chain_ref} and not resname ACE and not resname NME) or resname {ligres}" ) u_lig = u_ref_l.select_atoms(f"resname {ligres}") bs_atoms = u_ref_l.select_atoms(f"protein and around {bsite_dist} resname {ligres}") # Handle reference with no ligand if len(u_lig) == 0: if lig_ref_pdb and aligned_temp: bs_ids, __ = find_bsite_resids( file_mob, lig_ref_pdb, aligned_temp, ligres, chain_mob, chain_ref2, bsite_dist, res_threshold=5, ) bs_atoms = u_ref_l.select_atoms(" or ".join([f"resid {r}" for r in bs_ids])) else: raise ValueError( f"No ligand found in ref with resname {ligres}. Provide a correct ligand name or a second reference PDB." ) # Align sequences to ensure residue numbering is consistent seq_mob = pdb_to_seq(Path(file_mob), chain=str(chain_mob)).seq.replace("X", "") seq_ref = pdb_to_seq(Path(file_ref), chain=str(chain_ref)).seq.replace("X", "") start_resid, end_resid = get_residue_mapping(seq_ref, seq_mob) u_ref_filter = u_ref.select_atoms(f"resid {start_resid}:{end_resid}") u = u.select_atoms(f"resid {start_resid}:{end_resid}") # Select binding site residues binding_site = [ r.resid - start_resid + 1 for r in bs_atoms.residues if "CA" in r.atoms.names ] binding_site_n = [ str(r.resname) for r in bs_atoms.residues if "CA" in r.atoms.names ] binding_site_m = [] binding_site_r = [] for i, r in enumerate(binding_site): res = u.residues[r - 1] if binding_site_n[i] == res.resname: binding_site_m.append(res.resid) binding_site_r.append(r) else: logging.warning( f"Missmatch in residue with index {i} {binding_site_n[i]} != {res.resname}" ) sel_bs = " or ".join(f"resid {r}" for r in binding_site_r) sel_bs_m = " or ".join(f"resid {r}" for r in binding_site_m) if len(sel_bs_m) > 0: u_bs_m = u.select_atoms(sel_bs_m) else: alignment = pairwise2.align.globalms(seq_mob, seq_ref, 2, -1, -0.8, -0.5)[0] logging.warning( "The sequences may be different! \n", pairwise2.format_alignment(*alignment) ) return -1 u_bs_ref = u_ref_filter.select_atoms(sel_bs) rmsd_sel = "name CA" if rmsd_mode == "CA" else "not name H*" m_pos, ref_pos = u_bs_m.select_atoms(rmsd_sel), u_bs_ref.select_atoms(rmsd_sel) # Match common atoms per residue mpos_list, refpos_list = [], [] for mob_res, ref_res in zip(m_pos.residues, ref_pos.residues): common_atoms = set(mob_res.atoms.names).intersection(ref_res.atoms.names) mob_common, ref_common = mob_res.atoms.select_atoms( f"name {' or name '.join(common_atoms)}" ), ref_res.atoms.select_atoms(f"name {' or name '.join(common_atoms)}") if len(mob_common) == len(ref_common): mpos_list.append(mob_common.positions) refpos_list.append(ref_common.positions) try: rmsd = np.sqrt( ((np.vstack(mpos_list) - np.vstack(refpos_list)) ** 2).sum(-1).mean() ) except ValueError: logging.error(f"Error: Mismatched lengths ({len(m_pos)} vs {len(ref_pos)})") rmsd = -1 return rmsd