Source code for drugforge.data.backend.openeye

from base64 import b64decode, b64encode
from functools import reduce
from pathlib import Path
from typing import Any, Dict, List, Optional, Union  # noqa: F401
from warnings import warn

from drugforge.data.schema.schema_base import MoleculeFilter
from openeye import (  # noqa: F401
    oechem,
    oedepict,
    oedocking,
    oeff,
    oegraphsim,
    oegrid,
    oeomega,
    oequacpac,
    oeshape,
    oespruce,
    oeszybki,
)

# exec on module import

if not oechem.OEChemIsLicensed("python"):
    warn("OpenEye license required to use drugforge openeye module")


[docs] def combine_protein_ligand( prot: oechem.OEMol, lig: oechem.OEMol, lig_name: str = "LIG", lig_chain: Optional[str] = "", resid: Optional[int] = None, start_atom_id: Optional[int] = None, ) -> oechem.OEMol: """ Combine a protein OEMol and ligand OEMol into one, handling residue/atom numbering, and HetAtom status. Parameters ---------- prot : oechem.OEMol OEMol with the protein atoms. This should have perceived resiudes lig : oechem.OEMol OEMol with the ligand atoms lig_name : str, default="LIG" Residue name to give to the ligand atoms resid : int, optional Which residue number to assign to the ligand. If not given, the largest existing residue number in `prot` will be found, and the ligand will be assigned the next number start_atom_id : int, optional Which atom number to assign to the first atom in the ligand. If not given, the next available atom number will be calculated and assigned Returns ------- oechem.OEMol Combined molecule, with the appropriate biopolymer field set for the lig atoms """ # Calculate residue number if necessary if resid is None: # Find max resid for numbering the ligand residue # Add 1 so we start numbering at the next residue id resid = max([r.GetResidueNumber() for r in oechem.OEGetResidues(prot)]) + 1 # Calculate atom number if necessary if start_atom_id is None: # Same with atom numbering start_atom_id = ( max([oechem.OEAtomGetResidue(a).GetSerialNumber() for a in prot.GetAtoms()]) + 1 ) # Make copies so we don't modify the original molecules prot = prot.CreateCopy() lig = lig.CreateCopy() # Keep track of how many times each element has been seen in the ligand # Each atom in a residue needs a unique name, so just append this number to the # element num_elem_atoms = {} # Adjust molecule residue properties for a in lig.GetAtoms(): # Set atom name cur_name = oechem.OEGetAtomicSymbol(a.GetAtomicNum()) try: new_name = f"{cur_name}{num_elem_atoms[cur_name]}" num_elem_atoms[cur_name] += 1 except KeyError: new_name = cur_name num_elem_atoms[cur_name] = 1 a.SetName(new_name) # Set residue level properties res = oechem.OEAtomGetResidue(a) res.SetName(lig_name.upper()) res.SetResidueNumber(resid) res.SetSerialNumber(start_atom_id) if lig_chain: res.SetChainID(lig_chain) start_atom_id += 1 res.SetHetAtom(True) oechem.OEAtomSetResidue(a, res) # Combine the mols oechem.OEAddMols(prot, lig) return prot
[docs] def load_openeye_pdb( pdb_fn: Union[str, Path], alt_loc: bool = False ) -> oechem.OEGraphMol: """ Load an OpenEye OEGraphMol from a PDB file. Parameters ---------- pdb_fn : Union[str, Path] The path to the input PDB file. alt_loc : bool, optional Whether to keep track of alternate locations, by default False. Returns ------- oechem.OEGraphMol The OEGraphMol loaded from the input file. Raises ------ FileNotFoundError If the specified file does not exist. oechem.OEError If the CIF file cannot be opened. """ if not Path(pdb_fn).exists(): raise FileNotFoundError(f"{pdb_fn} does not exist!") ifs = oechem.oemolistream() ifs_flavor = oechem.OEIFlavor_PDB_Default | oechem.OEIFlavor_PDB_DATA # Add option for keeping track of alternat locations in PDB file if alt_loc: ifs_flavor |= oechem.OEIFlavor_PDB_ALTLOC ifs.SetFlavor( oechem.OEFormat_PDB, ifs_flavor, ) if ifs.open(str(pdb_fn)): in_mol = oechem.OEGraphMol() oechem.OEReadMolecule(ifs, in_mol) ifs.close() return in_mol else: oechem.OEThrow.Fatal(f"Unable to open {pdb_fn}")
[docs] def load_openeye_smi(smi_fn: Union[str, Path]) -> list[oechem.OEGraphMol]: """ Load an OpenEye SMILES file containing a set of molecules and return them as OpenEye OEGraphMol objects. Parameters ---------- smi_fn : Union[str, Path] Path to the SMILES file to load. Returns ------- list[oechem.OEGraphMol] A list of OpenEye OEGraphMol objects corresponding to the data from the SMI file. Raises ------ FileNotFoundError If the specified file does not exist. oechem.OEError If the SMI file cannot be opened. """ # convert to path to make consistent smi_fn = Path(smi_fn) if not smi_fn.exists(): raise FileNotFoundError(f"{str(smi_fn)} does not exist!") ifs = oechem.oemolistream(smi_fn.as_posix()) ifs.SetFlavor(oechem.OEFormat_SMI, oechem.OEIFlavor_SMI_DEFAULT) molecules = [] for mol in ifs.GetOEGraphMols(): molecules.append(oechem.OEGraphMol(mol)) return molecules
[docs] def load_openeye_cif1(cif1_fn: Union[str, Path]) -> oechem.OEGraphMol: """ Loads a biological assembly file into an OEGraphMol object. Current version requires going through an OpenMM intermediate. Parameters ---------- cif1_fn : Union[str, Path] The path to the input CIF1 file. Returns ------- oechem.OEGraphMol oechem.OEGraphMol: the biological assembly as an OEGraphMol object. """ from tempfile import NamedTemporaryFile from openmm.app import PDBFile, PDBxFile if not Path(cif1_fn).exists(): raise FileNotFoundError(f"{cif1_fn} does not exist!") cif = PDBxFile(str(cif1_fn)) # the keep ids flag is critical to make sure the residue numbers are correct with NamedTemporaryFile("w", suffix=".pdb") as f: PDBFile.writeFile(cif.topology, cif.positions, f, keepIds=True) prot = load_openeye_pdb(f.name) return prot
[docs] def load_openeye_cif( cif_fn: Union[str, Path], alt_loc: bool = False ) -> oechem.OEGraphMol: """ Load an OpenEye OEGraphMol object from a CIF file. Parameters ---------- cif_fn : Union[str, Path] The path of the CIF file to read. alt_loc : bool, optional If True, include alternative locations for atoms in the resulting OEGraphMol object. Default is False. Returns ------- oechem.OEGraphMol The OEGraphMol object read from the CIF file. Raises ------ FileNotFoundError If the specified file does not exist. oechem.OEError If the CIF file cannot be opened. Notes ----- This function will raise an exception if the specified file does not exist or if the CIF file cannot be opened. If `alt_loc` is True, the resulting OEGraphMol object will include alternative locations for atoms. """ if not Path(cif_fn).exists(): raise FileNotFoundError(f"{cif_fn} does not exist!") ifs = oechem.oemolistream() ifs_flavor = oechem.OEIFlavor_MMCIF_DEFAULT # Add option for keeping track of alternat locations in PDB file # TODO: check if this is a thing in mmcif # TODO: Currently this actually fails on biological assemblies so I'm not using it but it *should* work if not alt_loc: ifs_flavor |= oechem.OEIFlavor_MMCIF_NoAltLoc ifs.SetFlavor( oechem.OEFormat_MMCIF, ifs_flavor, ) if ifs.open(str(cif_fn)): in_mol = oechem.OEGraphMol() oechem.OEReadMolecule(ifs, in_mol) ifs.close() return in_mol else: oechem.OEThrow.Fatal(f"Unable to open {cif_fn}")
[docs] def load_openeye_mol2(mol2_fn: Union[str, Path]) -> oechem.OEMol: """ Load an OpenEye MOL2 file and return it as an OpenEye OEMol object. Reads multiple conformers into the OEMol object but if the MOL2 file contains multiple molecules, it will only return the first one. Parameters ---------- MOL2_fn : Union[str, Path] Path to the MOL2 file to load. Returns ------- oechem.OEMol An OpenEye OEMol object containing the molecule data from the MOL2 file. Raises ------ FileNotFoundError If the specified file does not exist. oechem.OEError If the MOL2 file cannot be opened. Notes ----- This function assumes that the MOL2 file contains a single molecule. If the file contains more than one molecule, only the first molecule will be loaded. """ if not Path(mol2_fn).exists(): raise FileNotFoundError(f"{mol2_fn} does not exist!") ifs = oechem.oemolistream() ifs.SetFlavor( oechem.OEFormat_MOL2, oechem.OEIFlavor_MOL2_Default, ) ifs.SetConfTest(oechem.OEOmegaConfTest()) if ifs.open(str(mol2_fn)): for mol in ifs.GetOEMols(): ifs.close() return mol else: oechem.OEThrow.Fatal(f"Unable to open {mol2_fn}")
[docs] def load_openeye_sdf(sdf_fn: Union[str, Path]) -> oechem.OEMol: """ Load an OpenEye SDF file and return it as an OpenEye OEMol object. Reads multiple conformers into the OEMol object but if the sdf file contains multiple molecules, it will only return the first one. Parameters ---------- sdf_fn : Union[str, Path] Path to the SDF file to load. Returns ------- oechem.OEMol An OpenEye OEMol object containing the molecule data from the SDF file. Raises ------ FileNotFoundError If the specified file does not exist. oechem.OEError If the SDF file cannot be opened. Notes ----- This function assumes that the SDF file contains a single molecule. If the file contains more than one molecule, only the first molecule will be loaded. """ if not Path(sdf_fn).exists(): raise FileNotFoundError(f"{sdf_fn} does not exist!") ifs = oechem.oemolistream() ifs.SetFlavor( oechem.OEFormat_SDF, oechem.OEIFlavor_SDF_Default, ) ifs.SetConfTest(oechem.OEOmegaConfTest()) if ifs.open(str(sdf_fn)): for mol in ifs.GetOEMols(): ifs.close() return mol else: oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}")
[docs] def load_openeye_sdfs(sdf_fn: Union[str, Path]) -> list[oechem.OEGraphMol]: """ Load a list of OpenEye OEGraphMol objects from an SDF file. Parameters ---------- sdf_fn : Union[str, Path] The path of the SDF file to read. Returns ------- list of oechem.OEGraphMol A list of the OEGraphMol objects read from the SDF file. Raises ------ FileNotFoundError If the specified file does not exist. oechem.OEError If the CIF file cannot be opened. Notes ----- This function will raise an exception if the specified file does not exist or if the SDF file cannot be opened. """ if not Path(sdf_fn).exists(): raise FileNotFoundError(f"{sdf_fn} does not exist!") ifs = oechem.oemolistream() ifs.SetFlavor( oechem.OEFormat_SDF, oechem.OEIFlavor_SDF_Default, ) cmpd_list = [] if ifs.open(str(sdf_fn)): for mol in ifs.GetOEGraphMols(): cmpd_list.append(mol.CreateCopy()) ifs.close() return cmpd_list else: oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}")
[docs] def load_openeye_design_unit(du_fn: Union[str, Path]) -> oechem.OEDesignUnit: """ Load an OpenEye design unit from a file Parameters ---------- du_fn : Union[str, Path] The path of the DesignUnit file to read. Returns ------- oechem.OEDesignUnit OpenEye DesignUnit Raises ------ FileNotFoundError If the specified file does not exist. oechem.OEError If the CIF file cannot be opened. """ if not Path(du_fn).exists(): raise FileNotFoundError(f"{du_fn} does not exist!") du = oechem.OEDesignUnit() retcode = oechem.OEReadDesignUnit(str(du_fn), du) if not retcode: oechem.OEThrow.Fatal(f"Unable to open {du_fn}") return du
[docs] def save_openeye_pdb(mol, pdb_fn: Union[str, Path]) -> Path: """ Write an OpenEye OEGraphMol object to a PDB file. Parameters ---------- mol : oechem.OEGraphMol The OEGraphMol object to write to the PDB file. pdb_fn : Union[str, Path] The path of the PDB file to create or overwrite. Returns ------- Path The path of the PDB file that was written. Notes ----- This function will overwrite any existing file with the same name as `pdb_fn`. """ ofs = oechem.oemolostream() ofs.SetFlavor(oechem.OEFormat_PDB, oechem.OEOFlavor_PDB_Default) if ofs.open(str(pdb_fn)): oechem.OEWriteMolecule(ofs, mol) else: oechem.OEThrow.Fatal(f"Unable to open {pdb_fn}") ofs.close() return Path(pdb_fn)
[docs] def save_openeye_sdf(mol, sdf_fn: Union[str, Path]) -> Path: """ Write an OpenEye OEGraphMol object to an SDF file. Parameters ---------- mol : oechem.OEGraphMol The OEGraphMol object to write to the SDF file. sdf_fn : Union[str, Path] The path of the SDF file to create or overwrite. Returns ------- Path The path of the SDF file that was written. Notes ----- This function will overwrite any existing file with the same name as `sdf_fn`. """ ofs = oechem.oemolostream() ofs.SetFlavor(oechem.OEFormat_SDF, oechem.OEOFlavor_SDF_Default) if ofs.open(str(sdf_fn)): oechem.OEWriteMolecule(ofs, mol) else: oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}") ofs.close() return Path(sdf_fn)
[docs] def save_openeye_sdfs(mols, sdf_fn: Union[str, Path]) -> Path: """ Write a list of OpenEye OEGraphMol objects to a single SDF file. Parameters ---------- mols : list of oechem.OEGraphMol The list of OEGraphMol objects to write to the SDF file. sdf_fn : Union[str, Path] The path of the SDF file to create or overwrite. Returns ------- Path The path of the SDF file that was written. Raises ------ oechem.OEError If the SDF file cannot be opened. Notes ----- This function will overwrite any existing file with the same name as `sdf_fn`. """ ofs = oechem.oemolostream() ofs.SetFlavor( oechem.OEFormat_SDF, oechem.OEOFlavor_SDF_Default, ) if ofs.open(str(sdf_fn)): for mol in mols: oechem.OEWriteMolecule(ofs, mol) ofs.close() else: oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}") return Path(sdf_fn)
[docs] def save_openeye_design_unit(du: oechem.OEDesignUnit, du_fn: Union[str, Path]) -> Path: """ Write an OpenEye design unit to a file Parameters ---------- du : oechem.OEDesignUnit The OpenEye DesignUnit to write to the file. du_fn : Union[str, Path] The path of the DesignUnit file to create or overwrite. Returns ------- Path The path of the DesignUnit file that was written. Raises ------ oechem.OEError If the DesignUnit file cannot be opened. Notes ----- This function will overwrite any existing file with the same name as `du_fn`. """ retcode = oechem.OEWriteDesignUnit(str(du_fn), du) if not retcode: oechem.OEThrow.Fatal(f"Unable to open {du_fn}") return Path(du_fn)
[docs] def openeye_perceive_residues( prot: oechem.OEGraphMol, preserve_all: bool = False ) -> oechem.OEGraphMol: """ Re-perceive the residues of a protein molecule using OpenEye's OEPerceiveResidues function, which is necessary when changes are made to the protein to ensure correct atom ordering and CONECT record creation. Parameters ---------- prot : oechem.OEGraphMol The input protein molecule to be processed. preserve_all : bool, optional, default=False If True, preserve all residue information, including chain ID, residue number, and residue name. Returns ------- oechem.OEGraphMol The processed protein molecule with re-perceived residue information. """ # Clean up PDB info by re-perceiving, perserving chain ID, residue number, and residue name if preserve_all: preserve = oechem.OEPreserveResInfo_All else: preserve = ( oechem.OEPreserveResInfo_ChainID | oechem.OEPreserveResInfo_ResidueNumber | oechem.OEPreserveResInfo_ResidueName ) oechem.OEPerceiveResidues(prot, preserve) return prot
[docs] def save_receptor_grid(du_fn: Union[str, Path], out_fn: Union[str, Path]) -> Path: """ Load in a design unit from a file and write out the receptor grid as a .ccp4 grid file. Parameters ---------- du_fn: Union[str, Path] File name/path with the design units. out_fn: Union[str, Path] Works with a .ccp4 extension Returns ------- Path Path to the receptor grid file """ du = oechem.OEDesignUnit() oechem.OEReadDesignUnit(str(du_fn), du) # oedocking.OEMakeReceptor(du) oegrid.OEWriteGrid( str(out_fn), oegrid.OEScalarGrid(du.GetReceptor().GetNegativeImageGrid()), ) return Path(out_fn)
[docs] def openeye_copy_pdb_data( source: oechem.OEGraphMol, destination: oechem.OEGraphMol, tag: str ) -> None: """ Copy over the PDB data from one object to another. Tag examples include "SEQRES" Parameters ---------- source: oechem.OEGraphMol Source molecule to copy the data from destination: oechem.OEGraphMol Destination molecule to copy the data tag: str Tag identifier for the data/metadata. Returns ------- """ # first, delete data with that tag oechem.OEDeletePDBData(destination, tag) # now, add over all the data with the tag for data_pair in oechem.OEGetPDBDataPairs(source): if data_pair.GetTag() == tag: oechem.OEAddPDBData(destination, data_pair)
[docs] def oemol_to_sdf_string(mol: oechem.OEMol) -> str: """ Dumps an OpenEye OEMol to an SDF string Parameters ---------- mol: oechem.OEMol OpenEye OEMol Returns ------- str SDF string representation of the input OEMol """ oms = oechem.oemolostream() oms.SetFormat(oechem.OEFormat_SDF) oms.openstring() oechem.OEWriteMolecule(oms, mol) molstring = oms.GetString().decode("UTF-8") return molstring
[docs] def sdf_string_to_oemol(sdf_str: str) -> oechem.OEMol: """ Loads an SDF string into an openeye molecule. Enables multiple conformers but only returns the first molecule in a multi molecule sdf Parameters ---------- sdf_str: str The string representation of an SDF file Returns ------- oechem.OEMol: resulting OpenEye OEMol """ ims = oechem.oemolistream() ims.SetFormat(oechem.OEFormat_SDF) ims.SetFlavor( oechem.OEFormat_SDF, oechem.OEIFlavor_SDF_Default, ) rmol = oechem.OEMol() ims.SetConfTest(oechem.OEOmegaConfTest()) if ims.openstring(sdf_str): for mol in ims.GetOEMols(): rmol = mol.CreateCopy() break # only return the first molecule return rmol
[docs] def smiles_to_oemol(smiles: str) -> oechem.OEMol: """ Loads a smiles string into an openeye molecule Parameters ---------- smiles: str SMILES string Returns ------- oechem.OEGraphMol resulting OpenEye OEGraphMol """ mol = oechem.OEMol() oechem.OESmilesToMol(mol, smiles) return mol
[docs] def oemol_to_smiles(mol: oechem.OEMol, isomeric=True) -> str: """ Canonical SMILES string of an OpenEye OEMol Paramers -------- mol: oechem.OEMol OpenEye OEMol isomeric: bool, optional, default=True If True, generate canonical isomeric SMILES (including stereochem) If False, generate canonical SMILES without stereochem Returns ------- str SMILES string of molecule """ # By default, OEMolToSmiles generates isomeric SMILES, which includes stereochemistry if isomeric: return oechem.OEMolToSmiles(mol) # However, if we want to treat two stereoisomers as the same molecule, # we can generate canonical SMILES that don't include stereo info else: return oechem.OECreateCanSmiString(mol)
[docs] def oe_smiles_roundtrip(smiles: str) -> str: """ Canonical SMILES string of an OpenEye OEMol Paramers -------- mol: oechem.OEMol OpenEye OEMol Returns ------- str SMILES string of molecule """ mol = smiles_to_oemol(smiles) return oemol_to_smiles(mol)
[docs] def oemol_to_inchi(mol: oechem.OEMol, fixed_hydrogens: bool = False) -> str: """ InChI string of an OpenEye OEMol Paramers -------- mol: oechem.OEMol OpenEye OEMol fixed_hydrogens: bool If a fixed hydrogen layer should be added to the InChI, if `True` this will result in a non-standard inchi which can distinguish tautomers. Returns ------- str InChI string of molecule """ if fixed_hydrogens: inchi_opts = oechem.OEInChIOptions() inchi_opts.SetFixedHLayer(True) inchi = oechem.OEMolToInChI(mol) else: inchi = oechem.OEMolToSTDInChI(mol) return inchi
[docs] def oemol_to_inchikey(mol: oechem.OEMol, fixed_hydrogens: bool = False) -> str: """ InChI key string of an OpenEye OEMol Paramers -------- mol: oechem.OEMol OpenEye OEMol fixed_hydrogens: bool If a fixed hydrogen layer should be added to the InChI, if `True` this will result in a non-standard inchi which can distinguish tautomers. Returns ------- str InChI key string of molecule """ if fixed_hydrogens: inchi_opts = oechem.OEInChIOptions() inchi_opts.SetFixedHLayer(True) inchi_key = oechem.OEMolToInChIKey(mol) else: inchi_key = oechem.OEMolToSTDInChIKey(mol) return inchi_key
def _set_SD_data(mol: oechem.OEMolBase, data: dict[str, str]) -> oechem.OEMolBase: """ Set SD data on an OpenEye OEMolBase object. Since this function works on OEMol, OEGraphMol, OEConfBase objects, it is worth repurposing. But it is not recommended to use this function directly for multi-conformer molecules. Parameters ---------- mol: oechem.OEMolBase OpenEye OEMolBase Returns ------- oechem.OEMolBase OpenEye OEMolBase with SD data set """ for key, value in data.items(): oechem.OESetSDData(mol, key, str(value)) return mol
[docs] def set_SD_data(mol: oechem.OEMol, data: dict[str, str | list]) -> oechem.OEMol: """ Set the SD data on an OpenEye OEMol, overwriting any existing data with the same tag If a str or a single-length list is passed as the values of the dictionary, the data will be set to all conformers. If a list is provided, the data will be set to the conformers in the order provided. If the list is not the same length as the number of conformers, an error will be raised. Parameters ---------- mol: oechem.OEMol OpenEye OEMol data: dict[str, str | list] Dictionary of SD data to set. Returns ------- oechem.OEMol OpenEye OEMol with SD data set """ from drugforge.data.util.data_conversion import get_first_value_of_dict_of_lists # convert to dict of lists first data = {k: v if isinstance(v, list) else [v] for k, v in data.items()} # If the object is an OEMol, we will set the SD data to all the conformers if isinstance(mol, oechem.OEMol): for key, value_list in data.items(): # if list is len 1, generate a list of len N, where N is the number of conformers if len(value_list) == 1: value_list = value_list * mol.NumConfs() if not len(value_list) == mol.NumConfs(): raise ValueError( f"Length of data for tag '{key}' does not match number of conformers ({mol.NumConfs()}). " f"Expected {mol.NumConfs()} but got {len(value_list)} elements." ) for i, conf in enumerate(mol.GetConfs()): oechem.OESetSDData(conf, key, str(value_list[i])) return mol elif isinstance(mol, oechem.OEMolBase): return _set_SD_data(mol, get_first_value_of_dict_of_lists(data)) else: raise TypeError( f"Expected an OpenEye OEMol, OEGraphMol, or OEConf, but got {type(mol)}" )
def _get_SD_data(mol: oechem.OEMolBase) -> dict[str, str]: """ Get SD data from an OpenEye OEMolBase object. Since this function works on OEMol, OEGraphMol, OEConfBase objects, it is worth repurposing. But it is not recommended to use this function directly for multi-conformer molecules. Parameters ---------- mol: oechem.OEMolBase OpenEye OEMolBase Returns ------- Dict[str, str] Dictionary of SD data """ return {dp.GetTag(): dp.GetValue() for dp in oechem.OEGetSDDataPairs(mol)}
[docs] def get_SD_data(mol: oechem.OEMolBase) -> dict[str, list]: """ Get all SD data on an OpenEye OEMol, OEGraphMol, or OEConfBase object. If multiple conformers are found, the SD tags from the conformers will be used, and any properties at the highest level will be ignored. Parameters ---------- mol: oechem.OEMol OpenEye OEMol Returns ------- dict[str, list] Dictionary of SD data Raises ------ TypeError If mol is a type that cant be converted to an OEMol, OEGraphMol, or OEConfBase """ from drugforge.data.util.data_conversion import ( get_dict_of_lists_from_dict_of_str, get_dict_of_lists_from_list_of_dicts, ) # If the object is an OEMol, we have to pull from the conformers, because even if there is only one conformer # the data is stored at the conformer level if you generate an oemol from an sdf file # However, if you've manually fiddled with the tags, the data might be stored at the molecule level. # In order to resolve this, if there is data at the high level that is not repeated at the conformer level, # we'll add to all the conformers and return that dict of lists. if isinstance(mol, oechem.OEMol): # Get the data from the molecule molecule_tags = _get_SD_data(mol) # get the data from the conformers conformer_tags = get_dict_of_lists_from_list_of_dicts( [_get_SD_data(conf) for conf in mol.GetConfs()] ) for k, v in molecule_tags.items(): if k not in conformer_tags: conformer_tags[k] = [v] * mol.NumConfs() return conformer_tags elif isinstance(mol, oechem.OEMolBase): return get_dict_of_lists_from_dict_of_str(_get_SD_data(mol)) else: raise TypeError( f"Expected an OpenEye OEMol, OEGraphMol, or OEConf, but got {type(mol)}" )
def _get_SD_data_to_object(mol: oechem.OEMol) -> dict[str, Any]: """ Get all SD data on an OpenEye OEMol, converting to Python objects Parameters ---------- mol: oechem.OEMol OpenEye OEMol Returns ------- Dict[str, Any] Dictionary of SD data """ import ast sd_data = get_SD_data(mol) for key, value in sd_data.items(): sd_data[key] = ast.literal_eval(value) return sd_data
[docs] def clear_SD_data(mol: oechem.OEMolBase) -> oechem.OEMol: """ Clear all SD data on an OpenEye OEMol Parameters ---------- mol: oechem.OEMol OpenEye OEMol Returns ------- oechem.OEMol OpenEye OEMol with SD data cleared """ oechem.OEClearSDData(mol) mol.SetTitle("") if isinstance(mol, oechem.OEMCMolBase): for conf in mol.GetConfs(): oechem.OEClearSDData(conf) return mol
[docs] def oemol_to_pdb_string(mol: oechem.OEMol) -> str: """ Dumps an OpenEye OEMol to a PDB string Parameters ---------- mol: oechem.OEMol OpenEye OEMol Returns ------- str PDB string representation of the input OEMol """ oms = oechem.oemolostream() oms.SetFormat(oechem.OEFormat_PDB) oms.SetFlavor(oechem.OEFormat_PDB, oechem.OEOFlavor_PDB_Default) oms.openstring() oechem.OEWriteMolecule(oms, mol) molstring = oms.GetString().decode("UTF-8") return molstring
[docs] def pdb_string_to_oemol(pdb_str: str) -> oechem.OEGraphMol: """ Loads a PDB string into an OpenEye OEGraphMol Parameters ---------- pdb_str: str The string representation of a PDB file Returns ------- oechem.OEMol resulting OpenEye OEMol """ ifs = oechem.oemolistream() ifs.SetFormat(oechem.OEFormat_PDB) ifs.SetFlavor( oechem.OEFormat_PDB, oechem.OEIFlavor_PDB_Default | oechem.OEIFlavor_PDB_DATA | oechem.OEIFlavor_PDB_ALTLOC, ) # noqa ifs.openstring(pdb_str) mol = oechem.OEGraphMol() if not oechem.OEReadMolecule(ifs, mol): oechem.OEThrow.Fatal("Cannot read molecule") return mol
[docs] def oedu_to_bytes64(oedu: oechem.OEDesignUnit) -> bytes: """ Convert an OpenEye DesignUnit to bytes Parameters ---------- oedu: oechem.OEDesignUnit OpenEye DesignUnit Returns ------- bytes bytes representation of the input DesignUnit encoded in base64 """ oedu_bytes = oechem.OEWriteDesignUnitToBytes(oedu) # convert to base64 return b64encode(oedu_bytes)
[docs] def bytes64_to_oedu(bytes: bytes) -> oechem.OEDesignUnit: """ Convert bytes to an OpenEye DesignUnit Parameters ---------- bytes: bytes bytes representation of a DesignUnit encoded in base64 Returns ------- oechem.OEDesignUnit resulting OpenEye DesignUnit """ # convert from base64 bytes = b64decode(bytes) du = oechem.OEDesignUnit() retcode = oechem.OEReadDesignUnitFromBytes(du, bytes) if not retcode: oechem.OEThrow.Fatal("Cannot read DesignUnit from bytes") return du
[docs] def split_openeye_mol( complex_mol, molecule_filter: Optional[Union[str, list[str], MoleculeFilter]] = None, prot_cutoff_len=10, keep_one_lig=True, ) -> dict: """ Split an OpenEye-loaded molecule into protein, ligand, etc. Uses the OpenEye OESplitMolComplex function, which automatically splits out only the first ligand binding site it sees. Parameters ---------- complex_mol : oechem.OEMolBase Complex molecule to split. molecule_filter : MoleculeFilter Molecule filter object that contains the filter criteria. prot_cutoff_len : int, default=10 Minimum number of residues in a protein chain required in order to keep keep_one_lig : bool, default=True Only keep one copy of the ligand in lig_mol Returns ------- dict A dict containing the protein ("prot"), ligand ("lig"), water atoms ("wat"), and all other atoms ("oth") """ # These are the objects that will be returned at the end lig_mol = oechem.OEGraphMol() prot_mol = oechem.OEGraphMol() water_mol = oechem.OEGraphMol() oth_mol = oechem.OEGraphMol() if molecule_filter is None: molecule_filter = MoleculeFilter() elif type(molecule_filter) is str: molecule_filter = MoleculeFilter(components_to_keep=[molecule_filter]) elif type(molecule_filter) is list: molecule_filter = MoleculeFilter(components_to_keep=molecule_filter) else: molecule_filter = molecule_filter # Make splitting split out covalent ligands possible # TODO: look into different covalent-related options here opts = oechem.OESplitMolComplexOptions() opts.SetSplitCovalent(True) opts.SetSplitCovalentCofactors(True) # Protein splitting options # Default of all atoms identified as protein + complex prot_only = oechem.OEMolComplexFilterFactory( oechem.OEMolComplexFilterCategory_ProtComplex ) # add in peptides as well, sometimes proteins are misidentified as peptides if bound or short peptide = oechem.OEMolComplexFilterFactory( oechem.OEMolComplexFilterCategory_Peptide ) # combine protein and peptide filters prot_only = oechem.OEOrRoleSet(prot_only, peptide) # If protein_chains are specified, only take protein atoms from those chains if len(molecule_filter.protein_chains) > 0: chain_filters = [ oechem.OERoleMolComplexFilterFactory( oechem.OEMolComplexChainRoleFactory(chain) ) for chain in molecule_filter.protein_chains ] chain_filter = reduce(oechem.OEOrRoleSet, chain_filters) prot_filter = oechem.OEAndRoleSet(prot_only, chain_filter) else: prot_filter = prot_only opts.SetProteinFilter(prot_filter) # Ligand splitting options # Default of all atoms identified as ligand lig_only = oechem.OEMolComplexFilterFactory( oechem.OEMolComplexFilterCategory_Ligand ) # If ligand_chain is specified, only take protein atoms from that chains if molecule_filter.ligand_chain: lig_chain = oechem.OERoleMolComplexFilterFactory( oechem.OEMolComplexChainRoleFactory(molecule_filter.ligand_chain) ) lig_filter = oechem.OEAndRoleSet(lig_only, lig_chain) else: lig_filter = lig_only # combine with NOT peptide filter lig_filter = oechem.OEAndRoleSet(lig_filter, oechem.OENotRoleSet(peptide)) opts.SetLigandFilter(lig_filter) # If water_chains are specified, set up filter for them if len(molecule_filter.water_chains) > 0: water_only = oechem.OEMolComplexFilterFactory( oechem.OEMolComplexFilterCategory_Water ) chain_filters = [ oechem.OERoleMolComplexFilterFactory( oechem.OEMolComplexChainRoleFactory(chain) ) for chain in molecule_filter.water_chains ] chain_filter = reduce(oechem.OEOrRoleSet, chain_filters) wat_filter = oechem.OEAndRoleSet(water_only, chain_filter) # combine with NOT peptide filter wat_filter = oechem.OEAndRoleSet(wat_filter, oechem.OENotRoleSet(peptide)) opts.SetWaterFilter(wat_filter) # Use python 'reduce' to combine all the filters into one, otherwise OpenEye will throw an error oechem.OESplitMolComplex( lig_mol, prot_mol, water_mol, oth_mol, complex_mol, opts, ) # TODO: make this nicer? # Get rid of any straggling extra copies of the ligand prot_mol = trim_small_chains(prot_mol, prot_cutoff_len) if molecule_filter.ligand_chain: keep_lig_chain = molecule_filter.ligand_chain else: keep_lig_chain = None # Get rid of extra copies of the ligand if keep_one_lig: all_lig_chains = [res.GetChainID() for res in oechem.OEGetResidues(lig_mol)] # Handle the case where the input has no ligand, otherwise throws an IndexError if len(all_lig_chains) > 0: # Keep first alphabetically, for reproducibility if not keep_lig_chain: keep_lig_chain = sorted(all_lig_chains)[0] for a in lig_mol.GetAtoms(): # Delete all atoms that don't match if oechem.OEAtomGetResidue(a).GetChainID() != keep_lig_chain: lig_mol.DeleteAtom(a) return { "prot": prot_mol, "lig": lig_mol, "wat": water_mol, "oth": oth_mol, "keep_lig_chain": keep_lig_chain, }
[docs] def split_openeye_design_unit(du, lig=None, lig_title=None): """ Parameters ---------- du : oechem.OEDesignUnit Design Unit to be saved lig : oechem.OEGraphMol, optional lig_title : str, optional ID of Ligand to be saved to the Title tag in the SDF, by default None include_solvent : bool, optional Whether to include solvent in the complex, by default True Returns ------- lig : oechem.OEGraphMol OE object containing ligand protein : oechem.OEGraphMol OE object containing only protein complex : oechem.OEGraphMol OE object containing ligand + protein """ prot = oechem.OEGraphMol() complex_ = oechem.OEGraphMol() du.GetProtein(prot) # if no ligand, return protein and complex with no ligand if not du.HasLigand(): return None, prot, prot if not lig: lig = oechem.OEGraphMol() du.GetLigand(lig) # Set ligand title, useful for the combined sdf file if lig_title: lig.SetTitle(f"{lig_title}") # Give ligand atoms their own chain "L" and set the resname to "LIG" residue = oechem.OEAtomGetResidue(next(iter(lig.GetAtoms()))) residue.SetChainID("L") residue.SetName("LIG") residue.SetHetAtom(True) for atom in list(lig.GetAtoms()): oechem.OEAtomSetResidue(atom, residue) # Combine protein and ligand and save # TODO: consider saving water as well oechem.OEAddMols(complex_, prot) oechem.OEAddMols(complex_, lig) # Clean up PDB info by re-perceiving, perserving chain ID, # residue number, and residue name openeye_perceive_residues(prot) openeye_perceive_residues(complex_) return lig, prot, complex_
[docs] def trim_small_chains(input_mol, cutoff_len=10): """ Remove short chains from a protein molecule object. The goal is to get rid of any peptide ligands that were mistakenly collected by OESplitMolComplex. Parameters ---------- input_mol : oechem.OEGraphMol OEGraphMol object containing the protein to trim cutoff_len : int, default=10 The cutoff length for peptide chains (chains must have more than this many residues to be included) Returns ------- oechem.OEGraphMol Trimmed molecule """ # Copy the molecule mol_copy = input_mol.CreateCopy() # Remove chains from mol_copy that are too short (possibly a better way of # doing this with OpenEye functions) # Get number of residues per chain chain_len_dict = {} hv = oechem.OEHierView(mol_copy) for chain in hv.GetChains(): chain_id = chain.GetChainID() for frag in chain.GetFragments(): frag_len = len(list(frag.GetResidues())) try: chain_len_dict[chain_id] += frag_len except KeyError: chain_len_dict[chain_id] = frag_len # Remove short chains atom by atom for a in mol_copy.GetAtoms(): chain_id = oechem.OEAtomGetResidue(a).GetExtChainID() if (chain_id not in chain_len_dict) or (chain_len_dict[chain_id] <= cutoff_len): mol_copy.DeleteAtom(a) return mol_copy