Source code for drugforge.data.services.cdd.cdd_api

import json
import time
from typing import Optional

import pandas
from drugforge.data.services.services_config import CDDSettings
from drugforge.data.services.web_utils import _BaseWebAPI


class CDDAPI(_BaseWebAPI):
    """
    An interface to the CDD JSON API which allows you to search for molecules,
    protocols and readouts like IC50.
    """

    def __init__(self, url: str, api_version: str, api_key: str, vault: str):
        """
        Create a client bound to a single CDD vault.

        Args:
            url: The base url, forwarded to the parent web API class.
            api_version: The API version, forwarded to the parent web API class.
            api_key: The API key, forwarded to the parent web API class.
            vault: The identifier of the vault, appended to the API url so that
                every request made by this client is scoped to that vault.
        """
        super().__init__(url=url, api_version=api_version, api_key=api_key)
        # now fix the url str so all endpoints are relative to the vault
        self.api_url += f"/vaults/{vault}/"

    @classmethod
    def token_name(cls) -> str:
        """Return the name of the token used to authenticate with the CDD."""
        return "X-CDD-token"

    @classmethod
    def from_settings(cls, settings: CDDSettings) -> "CDDAPI":
        """
        Build a client from a settings object.

        Args:
            settings: The settings providing `CDD_API_URL`, `CDD_API_VERSION`,
                `CDD_API_KEY` and `CDD_VAULT_NUMBER`.

        Returns:
            A new instance of this class configured from the settings.
        """
        return cls(
            url=settings.CDD_API_URL,
            api_version=settings.CDD_API_VERSION,
            api_key=settings.CDD_API_KEY,
            vault=settings.CDD_VAULT_NUMBER,
        )

    def _cdd_get(self, endpoint: str, payload: Optional[dict] = None) -> dict:
        """
        Issue a GET request against a vault endpoint and decode the JSON reply.

        Args:
            endpoint: The endpoint relative to the vault url, e.g. `molecules/`.
            payload: The optional JSON body of the request. When `None` no
                `json` argument is passed to the session at all.

        Returns:
            The decoded JSON content of the response.
        """
        request_args = {"url": self.api_url + endpoint}
        if payload is not None:
            request_args["json"] = payload
        response = self._session.get(**request_args)
        return json.loads(response.content.decode())

    def get_molecules(
        self,
        smiles: Optional[str] = None,
        names: Optional[list[str]] = None,
        compound_ids: Optional[list[int]] = None,
    ) -> Optional[list[dict]]:
        """
        Search for molecules in the CDD vault.

        Notes:
            CDD only allows for a single structure searches via smiles, multiple
            molecules can be downloaded when using names or compound_ids.
            If molecule ids are missing in CDD we only return the subset that can
            be found.

        Args:
            smiles: The smiles of the molecule to search for.
            names: The list of names of molecules which should be searched in the CDD.
            compound_ids: The list of CDD compound ids of molecules we wish to search for.

        Returns:
            A list of molecules found in the CDD, or `None` if nothing was found.

        Raises:
            ValueError: If more than one of the search arguments is provided.
            RuntimeError: If the CDD reports an error which can not be recovered
                from by dropping missing compound ids.
        """
        provided = sum(arg is not None for arg in (smiles, names, compound_ids))
        if provided > 1:
            raise ValueError(
                "The arguments `smiles`, `names` and `compound_ids` are mutually exclusive provide only one."
            )

        mol_data = {"only_batch_ids": "true"}
        if smiles is not None:
            mol_data["structure"] = smiles
            mol_data["no_structures"] = "true"
            mol_data["structure_search_type"] = "exact"
        elif names is not None:
            mol_data["names"] = names
            mol_data["async"] = "true"
        else:
            mol_data["molecules"] = compound_ids
            mol_data["async"] = "true"

        result = self._cdd_get("molecules/", mol_data)

        # handle missing molecules, originally found when searching moonshot data
        if "error" in result:
            if compound_ids is None:
                # the recovery below only makes sense for an id based search
                raise RuntimeError(f"CDD molecule search failed: {result['error']}")

            import re

            # extract the set of missing molecule ids quoted in the error message
            missing_mols = {int(hit) for hit in re.findall(r"[0-9]+", result["error"])}
            to_find = [mol for mol in compound_ids if mol not in missing_mols]
            if not to_find:
                # every requested molecule is missing, do not send an empty filter
                return None
            mol_data["molecules"] = to_find
            # run the search again with the molecules that should exist
            result = self._cdd_get("molecules/", mol_data)
            if "error" in result:
                raise RuntimeError(f"CDD molecule search failed: {result['error']}")

        if "async" in mol_data:
            result = self.get_async_export(job_id=result["id"])

        if result["count"] == 0:
            return None
        return result["objects"]

    def get_protocols(
        self,
        protocol_names: Optional[list[str]] = None,
    ) -> list[dict]:
        """
        Search for a specific protocol.

        Args:
            protocol_names: The list of protocol names to search for, if not
                provided all protocols will be pulled.

        Returns:
            A list of protocols associated with the given name.
        """
        protocol_data = {}
        if protocol_names is not None:
            protocol_data["names"] = protocol_names
        return self._cdd_get("protocols", protocol_data)["objects"]

    def get_readout_rows(
        self,
        protocol: int,
        molecule_ids: Optional[list[int]] = None,
        types: Optional[list[str]] = None,
    ) -> Optional[list[dict]]:
        """
        Get the readout data for a specific protocol performed on a set of molecules.

        Args:
            protocol: The id of the protocol to use in the search.
            molecule_ids: The CDD ids of the molecules to get the values for, if
                None all molecules under this protocol will be downloaded.
            types: A list of readout types to pull the results for.

        Returns:
            A list of readout rows matching the search, or `None` if there are
            none. The actual values of each row are stored under `readouts`.
        """
        readout_data = {
            "protocols": [protocol],
            "async": "true",  # use async as we may have many results
        }
        if types is not None:
            readout_data["type"] = types
        if molecule_ids is not None:
            readout_data["molecules"] = molecule_ids

        request_id = self._cdd_get("readout_rows", readout_data)["id"]
        result_data = self.get_async_export(job_id=request_id)
        if result_data["count"] == 0:
            return None
        return result_data["objects"]

    def get_async_export(
        self,
        job_id: int,
        poll_interval: float = 1.0,
        timeout: Optional[float] = None,
    ) -> dict:
        """
        A helper method to gather async request results.

        Notes:
            This function waits till the request is complete before returning
            the results. By default it waits forever, pass `timeout` to bound
            the wait.

        Args:
            job_id: The id of the request we want the results for.
            poll_interval: The number of seconds to sleep between two polls.
            timeout: The maximum number of seconds to wait for the export, or
                `None` to wait indefinitely.

        Returns:
            The finished request.

        Raises:
            TimeoutError: If `timeout` is set and the export did not finish in time.
        """
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            result = self._cdd_get(f"exports/{job_id}")
            # the export is only complete once the objects are part of the reply
            if "objects" in result:
                return result
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"CDD export {job_id} did not finish within {timeout} seconds."
                )
            time.sleep(poll_interval)

    def _find_protocol(self, protocol_name: str) -> Optional[dict]:
        """Return the first protocol matching the name, or `None` if there is none."""
        protocols = self.get_protocols(protocol_names=[protocol_name])
        return protocols[0] if protocols else None

    def _attach_molecule_info(self, rows: list[dict]) -> pandas.DataFrame:
        """
        Add the structure information of the molecules to the readout rows.

        Args:
            rows: The extracted readout rows, each storing the CDD molecule id
                under `name`.

        Returns:
            A dataframe of the rows for which the molecule data could be found.
        """
        if not rows:
            # nothing to annotate, avoid searching the CDD with an empty id list
            return pandas.DataFrame([])

        compound_ids = {row["name"] for row in rows}
        # get_molecules returns None when nothing is found
        molecule_data = self.get_molecules(compound_ids=list(compound_ids)) or []
        compounds_by_id = {molecule["id"]: molecule for molecule in molecule_data}

        final_data = []
        for row in rows:
            try:
                mol_data = compounds_by_id[row["name"]]
                row["Smiles"] = mol_data["smiles"]
                row["Inchi"] = mol_data["inchi"]
                row["Inchi Key"] = mol_data["inchi_key"]
                row["Molecule Name"] = mol_data["name"]
                row["CXSmiles"] = mol_data["cxsmiles"]
            except KeyError:
                # the molecule is missing from the CDD or lacks structure data
                continue
            final_data.append(row)
        return pandas.DataFrame(final_data)

    def get_ic50_data(self, protocol_name: str) -> Optional[pandas.DataFrame]:
        """
        A convenience method which wraps the required function calls to gather
        the raw ic50 data from the CDD calculated as part of the named protocol.

        Args:
            protocol_name: The name of the protocol we want all IC50 result for.

        Returns:
            A dataframe containing the IC50 values along with upper and lower CI
            and curve class for each batch measurement on the molecules performed
            as part of the given protocol. `None` is returned if the protocol can
            not be found, if it does not define all of the required readouts or
            if it has no readout rows.
        """
        # get the protocol we want the readout for
        protocol = self._find_protocol(protocol_name)
        if protocol is None:
            return None

        # define the readouts we want to find and get the ids
        required_data = {
            "IC50": None,
            "IC50 CI (Lower)": None,
            "IC50 CI (Upper)": None,
            "Curve class": None,
        }
        for readout_def in protocol["readout_definitions"]:
            if (readout_name := readout_def["name"]) in required_data:
                # gather the id of result for this readout
                required_data[readout_name] = readout_def["id"]

        # if any of the data is missing return, note the ids are the dict values
        if None in required_data.values():
            return None

        # pull down all batch readouts for this protocol and extract the data
        readout_data = self.get_readout_rows(
            protocol=protocol["id"], types=["batch_run_aggregate_row"]
        )
        if not readout_data:
            return None

        # map the output column name to the key of the readout in each row
        columns = {
            f"{protocol_name}: {key}{' (µM)' if 'IC50' in key else ''}": str(readout_id)
            for key, readout_id in required_data.items()
        }

        # extract the results linking the molecules to the extracted data
        ic50_data = []
        for readout in readout_data:
            try:
                batch_data = {
                    column: readout["readouts"][readout_key]["value"]
                    for column, readout_key in columns.items()
                }
                # store the molecule id so the molecule data can be added later
                batch_data["name"] = readout["molecule"]
                batch_data["modified_at"] = readout["modified_at"]
            except KeyError:
                # This is triggered if the upper and lower CI values are missing
                # This means the value falls outside the dose series
                continue
            ic50_data.append(batch_data)

        return self._attach_molecule_info(ic50_data)

    def get_readout(
        self, protocol_name: str, readout: str
    ) -> Optional[pandas.DataFrame]:
        """
        Gather the values of a single readout of the named protocol.

        Args:
            protocol_name: The name of the protocol the readout belongs to.
            readout: The name of the readout definition to extract.

        Returns:
            A dataframe with the readout value, the molecule id, the modification
            date and the structure information of each molecule. `None` is
            returned if the protocol can not be found or has no readout rows.

        Raises:
            ValueError: If the protocol does not define the requested readout.
        """
        # get the protocol we want the readout for
        protocol = self._find_protocol(protocol_name)
        if protocol is None:
            return None

        readout_ids = {
            readout_def["name"]: readout_def["id"]
            for readout_def in protocol["readout_definitions"]
        }
        if readout not in readout_ids:
            raise ValueError(
                f"Column {readout} not found in protocol {protocol_name}, available columns: {set(readout_ids.keys())}"
            )

        readout_data = self.get_readout_rows(protocol=protocol["id"])
        if not readout_data:
            return None

        readout_key = str(readout_ids[readout])
        coldata = []
        for readout_elem in readout_data:
            try:
                batch_data = {
                    readout: readout_elem["readouts"][readout_key]["value"],
                    "name": readout_elem["molecule"],
                    "modified_at": readout_elem["modified_at"],
                }
            except KeyError:
                # rows without a value for this readout are skipped
                continue
            coldata.append(batch_data)

        return self._attach_molecule_info(coldata)