# Source code for drugforge.data.services.cdd.cdd_download

import logging
from io import StringIO

import pandas

# Root of the CDD vault REST API; vault-specific paths are appended to this
CDD_URL = "https://app.collaborativedrug.com/api/v1/vaults"

# Search ids used as the last component of "<CDD_URL>/<vault>/searches/<id>".
# Public search: all molecules that have SMILES
MOONSHOT_ALL_SMI_SEARCH = "13157856-vbatz0uAL8fhJR87pFN0tA"
# Noncovalent molecules that have experimental measurements (from John)
MOONSHOT_NONCOVALENT_SMI_SEARCH = "9737468-RPSZ3XnVP-ufU6nNTJjZ_Q"
# Same as above, but the export also includes the batch created date
MOONSHOT_NONCOVALENT_W_DATES_SEARCH = "11947939-KXLWU3JLbLzI354es-VKVg"

# Human-readable search name -> search id
MOONSHOT_SEARCH_DICT = dict(
    sars_fluorescence_all_smi=MOONSHOT_ALL_SMI_SEARCH,
    sars_fluorescence_noncovalent_no_dates=MOONSHOT_NONCOVALENT_SMI_SEARCH,
    sars_fluorescence_noncovalent_w_dates=MOONSHOT_NONCOVALENT_W_DATES_SEARCH,
)

# Search id for all molecules with Mac1 FRET data
ASAP_MAC1_ALL_FRET = "13002158-OsTakM3U--QoAEusMICUDA"



# [docs] (Sphinx "viewcode" link marker left over from the HTML page; not Python code)
def download_url(search_url, header, vault=None, timeout=5000, retry_delay=10):
    """
    Start an export through the API, wait for it to finish and download it.

    Three kinds of GET request are made, all with `header`: one to
    `search_url` (whose JSON reply provides the export id), repeated ones to
    the export progress URL until it reports the status "finished", and a
    final one to the export result URL.

    Parameters
    ----------
    search_url : string
        URL for the initial GET request
    header : dict
        Header information passed to GET request. Must contain an entry for
        'X-CDD-token' that gives the user's CDD API token
    vault : str, optional
        Vault used to build the export progress and export result URLs. If
        not given, the third-from-last "/"-separated component of
        `search_url` is used
    timeout : int, default=5000
        Maximum time (in seconds) to wait for the export to finish. This is
        measured as real elapsed time, so time spent in the status requests
        themselves counts towards it
    retry_delay : int, default=10
        Delay between export status checks (in seconds)

    Returns
    -------
    requests.Response
        Response object from the final export GET request

    Raises
    ------
    SystemExit
        If the export has not reached the "finished" status within `timeout`
        seconds
    """
    import sys
    import time

    import requests

    # If vault is not specified, attempt to parse from URL
    if not vault:
        vault = search_url.split("/")[-3]
        logging.debug(f"Using {vault} as vault.")

    # Make the initial download request
    logging.debug(f"download_url : initiating search {search_url}")
    response = requests.get(search_url, headers=header)
    logging.debug(f"  {response}")
    export_id = response.json()["id"]
    logging.debug(f"  Export id for requested search is {export_id}")

    # Poll the export status until it is ready. A monotonic deadline is used
    # rather than summing `retry_delay`, so that the limit still applies when
    # `retry_delay` is 0 and so that slow status requests are accounted for.
    status_url = f"{CDD_URL}/{vault}/export_progress/{export_id}"
    deadline = time.monotonic() + timeout
    while True:
        logging.debug(f"  checking if export is finished at {status_url}")
        response = requests.get(status_url, headers=header)
        status = response.json()["status"]

        if status == "finished":
            logging.debug("  Export is ready")
            break

        # Give up once the limit is reached
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            logging.error("Export Never Finished")
            logging.error(
                f"CDD Vault export timed out. Please check manually: {search_url}"
            )
            sys.exit("Export failed")

        # Sleep between attempts, but never past the deadline so that one
        # last status check happens right at the limit
        time.sleep(max(0, min(retry_delay, remaining)))

    # Send GET request for final export
    result_url = f"{CDD_URL}/{vault}/exports/{export_id}"
    response = requests.get(result_url, headers=header)

    return response



# TODO: Generalize inclusion criteria to something more compact

# [docs] (Sphinx "viewcode" link marker left over from the HTML page; not Python code)
def download_molecules(
    header,
    vault=None,
    search="sars_fluorescence_noncovalent_w_dates",
    fn_out=None,
    fn_cache=None,
    **kwargs,
):
    """
    Download molecules from a CDD search, then filter and parse them.

    The unfiltered CSV is read from `fn_cache` if that file already exists;
    otherwise it is downloaded with `download_url` and, if `fn_cache` is
    given, saved there. The resulting table is passed through
    `filter_molecules_dataframe` and `parse_fluorescence_data_cdd`, and the
    parsed result is saved to `fn_out` if provided.

    Parameters
    ----------
    header : dict
        Header information passed to GET request. Must contain an entry for
        'X-CDD-token' that gives the user's CDD API token
    vault : str, default=None
        Which CDD vault to search through. If not given, the value of the
        MOONSHOT_CDD_VAULT_NUMBER environment variable is used. Only needed
        when the data is actually downloaded (no usable cache file)
    search : str, default="sars_fluorescence_noncovalent_w_dates"
        Which entry in MOONSHOT_SEARCH_DICT to use as the search id. If the given value
        can't be found, assume it's the actual search id and try to download
    fn_out : str, optional
        If specified, filename to write the filtered and parsed CSV to
    fn_cache : str, optional
        If specified, file to read the unfiltered CSV from (if it exists) or
        to write the unfiltered CSV download to (if it doesn't). The file is
        read and written as UTF-8
    kwargs : dict
        Other arguments passed to filter_molecules_dataframe
        (id_fieldname, smiles_fieldname, assay_name, retain_achiral,
        retain_racemic, retain_enantiopure, retain_semiquantitative_data) and
        parse_fluorescence_data_cdd (keep_best_per_mol, assay_name, dG_T,
        cp_values, pic50_stderr_filt). Any other keys are ignored

    Returns
    -------
    pandas.DataFrame
        DataFrame returned by parse_fluorescence_data_cdd for the filtered
        molecules

    Raises
    ------
    ValueError
        If a download is required but `vault` is not given and the
        MOONSHOT_CDD_VAULT_NUMBER environment variable is not set
    """
    import os

    if fn_cache and os.path.exists(fn_cache):
        # The cache is always written as UTF-8 below, so read it back the same way
        with open(fn_cache, encoding="utf-8") as infile:
            content = infile.read()
    else:
        if not vault:
            try:
                vault = os.environ["MOONSHOT_CDD_VAULT_NUMBER"]
            except KeyError as err:
                raise ValueError("No value specified for vault.") from err
        # First try and get the search id from our known searches, otherwise assume the
        #  given value is the search id itself
        try:
            search_id = MOONSHOT_SEARCH_DICT[search]
        except KeyError:
            logging.debug(f"Using {search} as the search id directly.")
            search_id = search
        url = f"{CDD_URL}/{vault}/searches/{search_id}"
        logging.debug(f"Downloading data from CDD vault from {url}")
        response = download_url(url, header, vault=vault)
        # bytes.decode() defaults to UTF-8
        content = response.content.decode()

        if fn_cache:
            # Write with an explicit encoding: the platform default may not be
            # able to represent every character of the UTF-8 decoded payload
            with open(fn_cache, "w", encoding="utf-8") as outfile:
                outfile.write(content)

    # Parse into DF
    mol_df = pandas.read_csv(StringIO(content))
    logging.debug(f"\n{mol_df}")

    # Filter molecules according to the filter-related entries of kwargs
    logging.debug("Filtering dataframe...")
    from drugforge.data.util.utils import (
        filter_molecules_dataframe,
        parse_fluorescence_data_cdd,
    )

    # Only forward the keyword arguments each helper is meant to receive
    filter_keys = (
        "id_fieldname",
        "smiles_fieldname",
        "assay_name",
        "retain_achiral",
        "retain_racemic",
        "retain_enantiopure",
        "retain_semiquantitative_data",
    )
    filter_kwargs = {k: kwargs[k] for k in filter_keys if k in kwargs}
    filtered_df = filter_molecules_dataframe(mol_df, **filter_kwargs)

    parse_keys = (
        "keep_best_per_mol",
        "assay_name",
        "dG_T",
        "cp_values",
        "pic50_stderr_filt",
    )
    parse_kwargs = {k: kwargs[k] for k in parse_keys if k in kwargs}
    parsed_df = parse_fluorescence_data_cdd(filtered_df, **parse_kwargs)

    # Save to CSV as requested
    if fn_out:
        logging.debug(f"Generating CSV file {fn_out}")
        parsed_df.to_csv(fn_out, index=False)

    return parsed_df