Source code for drugforge.data.services.cdd.cdd_download

import logging
from io import StringIO

import pandas

# Base CDD vault API URL. Request URLs in this module are built by appending
# "/<vault>/..." to it.
CDD_URL = "https://app.collaborativedrug.com/api/v1/vaults"

# Ids of saved CDD searches. They are substituted into
# "<CDD_URL>/<vault>/searches/<search id>" by download_molecules.
# All molecules with SMILES (public)
MOONSHOT_ALL_SMI_SEARCH = "13157856-vbatz0uAL8fhJR87pFN0tA"
# Noncovalent molecules with experimental measurements (from John)
MOONSHOT_NONCOVALENT_SMI_SEARCH = "9737468-RPSZ3XnVP-ufU6nNTJjZ_Q"
# Noncovalent with experimental measurements, including batch created date
MOONSHOT_NONCOVALENT_W_DATES_SEARCH = "11947939-KXLWU3JLbLzI354es-VKVg"

# Maps the names accepted by the `search` argument of download_molecules to the
# search ids above. A `search` value that is not a key here is used as the
# search id itself.
MOONSHOT_SEARCH_DICT = {
    "sars_fluorescence_all_smi": MOONSHOT_ALL_SMI_SEARCH,
    "sars_fluorescence_noncovalent_no_dates": MOONSHOT_NONCOVALENT_SMI_SEARCH,
    "sars_fluorescence_noncovalent_w_dates": MOONSHOT_NONCOVALENT_W_DATES_SEARCH,
}

# All molecules with Mac1 FRET data. Not registered in MOONSHOT_SEARCH_DICT, so
# it has to be passed directly as the search id.
ASAP_MAC1_ALL_FRET = "13002158-OsTakM3U--QoAEusMICUDA"


def download_url(search_url, header, vault=None, timeout=5000, retry_delay=10):
    """
    Make requests to the API using the passed information.

    The initial GET on `search_url` starts an export and returns its id. The
    export progress endpoint is then polled until it reports "finished", after
    which the export itself is fetched and returned.

    Parameters
    ----------
    search_url : string
        URL for the initial GET request
    header : dict
        Header information passed to GET request. Must contain an entry for
        'X-CDD-token' that gives the user's CDD API token
    vault : str, optional
        Vault id used to build the progress and export URLs. If not given, it
        is taken from the third-from-last "/"-separated piece of `search_url`
    timeout : int, default=5000
        Maximum time to wait for the export to finish (in seconds)
    retry_delay : int, default=10
        Delay between export status checks (in seconds)

    Returns
    -------
    requests.Response
        Response object from the final export GET request. Its HTTP status is
        not checked here

    Raises
    ------
    requests.HTTPError
        If the initial request or a status check returns an HTTP error status
    SystemExit
        If the export is not finished within `timeout` seconds
    """
    import sys
    import time

    import requests

    # If vault is not specified, attempt to parse from URL
    if not vault:
        vault = search_url.split("/")[-3]
        logging.debug(f"Using {vault} as vault.")

    # Make the initial download request
    logging.debug(f"download_url : initiating search {search_url}")
    response = requests.get(search_url, headers=header)
    logging.debug(f" {response}")
    # Surface auth/URL problems as an HTTP error instead of a KeyError on "id"
    response.raise_for_status()
    export_id = response.json()["id"]
    logging.debug(f" Export id for requested search is {export_id}")

    # Check every `retry_delay` seconds to see if the export is ready. The
    # deadline is wall-clock based so that time spent inside the requests
    # counts against `timeout`, and so that retry_delay=0 cannot loop forever.
    status_url = f"{CDD_URL}/{vault}/export_progress/{export_id}"
    deadline = time.monotonic() + timeout
    while True:
        logging.debug(f" checking if export is finished at {status_url}")
        response = requests.get(status_url, headers=header)
        response.raise_for_status()
        if response.json()["status"] == "finished":
            logging.debug(" Export is ready")
            break

        # Give up now if the next check would land past the deadline, rather
        # than sleeping once more and then exiting without re-checking
        if time.monotonic() + retry_delay > deadline:
            logging.error("Export Never Finished")
            logging.error(
                f"CDD Vault export timed out. Please check manually: {search_url}"
            )
            sys.exit("Export failed")

        # Sleep between attempts
        time.sleep(retry_delay)

    # Send GET request for final export
    result_url = f"{CDD_URL}/{vault}/exports/{export_id}"
    return requests.get(result_url, headers=header)
# TODO: Generalize inclusion criteria to something more compact
def download_molecules(
    header,
    vault=None,
    search="sars_fluorescence_noncovalent_w_dates",
    fn_out=None,
    fn_cache=None,
    **kwargs,
):
    """
    Download all molecules and filter based on args in `kwargs`. Saves and
    loads unfiltered CSV file to `fn_cache` if provided, and saves filtered
    CSV file to `fn_out` if provided.

    Parameters
    ----------
    header : dict
        Header information passed to GET request. Must contain an entry for
        'X-CDD-token' that gives the user's CDD API token. Not used when the
        data is loaded from an existing `fn_cache`
    vault : str, default=None
        Which CDD vault to search through. If not given, the value of the
        MOONSHOT_CDD_VAULT_NUMBER environment variable is used
    search : str, default="sars_fluorescence_noncovalent_w_dates"
        Which entry in MOONSHOT_SEARCH_DICT to use as the search id. If the
        given value can't be found, assume it's the actual search id and try to
        download
    fn_out : str, optional
        If specified, filename to write the filtered and parsed CSV to
    fn_cache : str, optional
        If specified, file to write unfiltered CSV download to. If this file
        already exists it is read instead of downloading anything
    kwargs : dict
        Other arguments passed to filter_molecules_dataframe and
        parse_fluorescence_data_cdd. Only the keyword names listed in the
        function body are forwarded; any other keyword is ignored

    Returns
    -------
    pandas.DataFrame
        Result of parse_fluorescence_data_cdd applied to the filtered
        molecules

    Raises
    ------
    ValueError
        If a download is needed and no vault is given or set in the
        environment
    requests.HTTPError
        If the export download returns an HTTP error status
    """
    import os

    if fn_cache and os.path.exists(fn_cache):
        # The cache holds text that was decoded as UTF-8, so read it back the
        # same way instead of relying on the platform default encoding
        with open(fn_cache, encoding="utf-8") as infile:
            content = infile.read()
    else:
        if not vault:
            try:
                vault = os.environ["MOONSHOT_CDD_VAULT_NUMBER"]
            except KeyError as err:
                raise ValueError("No value specified for vault.") from err

        # First try and get the search id from our known searches, otherwise
        # assume the given value is the search id itself
        try:
            search_id = MOONSHOT_SEARCH_DICT[search]
        except KeyError:
            logging.debug(f"Using {search} as the search id directly.")
            search_id = search

        url = f"{CDD_URL}/{vault}/searches/{search_id}"
        logging.debug(f"Downloading data from CDD vault from {url}")
        response = download_url(url, header, vault=vault)
        # Fail here on an HTTP error so an error body is never parsed as CSV
        # or written to the cache, where it would be reused by later calls
        response.raise_for_status()
        content = response.content.decode()

        if fn_cache:
            with open(fn_cache, "w", encoding="utf-8") as outfile:
                outfile.write(content)

    # Parse into DF
    mol_df = pandas.read_csv(StringIO(content))
    logging.debug(f"\n{mol_df}")

    # Filter molecules; which ones are retained is decided by the caller's
    # kwargs and the defaults of filter_molecules_dataframe
    logging.debug("Filtering dataframe...")
    from drugforge.data.util.utils import (
        filter_molecules_dataframe,
        parse_fluorescence_data_cdd,
    )

    # Keyword names understood by each helper ("assay_name" goes to both)
    filter_keys = (
        "id_fieldname",
        "smiles_fieldname",
        "assay_name",
        "retain_achiral",
        "retain_racemic",
        "retain_enantiopure",
        "retain_semiquantitative_data",
    )
    parse_keys = (
        "keep_best_per_mol",
        "assay_name",
        "dG_T",
        "cp_values",
        "pic50_stderr_filt",
    )

    filter_kwargs = {k: kwargs[k] for k in filter_keys if k in kwargs}
    filtered_df = filter_molecules_dataframe(mol_df, **filter_kwargs)

    parse_kwargs = {k: kwargs[k] for k in parse_keys if k in kwargs}
    parsed_df = parse_fluorescence_data_cdd(filtered_df, **parse_kwargs)

    # Save to CSV as requested
    if fn_out:
        logging.debug(f"Generating CSV file {fn_out}")
        parsed_df.to_csv(fn_out, index=False)

    return parsed_df