Source code for drugforge.alchemy.predict

import base64
import warnings
from typing import Literal, Optional

import bokeh.models.widgets.tables
import bokeh.palettes
import bokeh.plotting
import cinnabar
import numpy as np
import pandas as pd
import panel
import plotmol
from bokeh.models import Band, ColumnDataSource, Range1d, Whisker
from cinnabar import stats
from drugforge.data.schema.ligand import Ligand
from openff.units import unit
from rdkit import Chem
from rdkit.Chem import Draw

# run to enable plotting with bokeh
panel.extension()


[docs] def get_magnitude(quantity: unit.Quantity) -> float: """ Convenience method to return the magnitude of an OpenFF unit quantity. This is useful in cases where data needs to be handled by libraries that are not able to parse pint-like quantities. Args: quantity: The value to be transformed. Returns: magnitude: The magnitude computed from the quantity. """ if type(quantity) is float: magnitude = quantity else: magnitude = quantity.magnitude return magnitude
[docs] def negative_log(value: float, inverse: bool = False) -> float: """ Convenience method to take the negative log of value, or to invert it. Handy for round tripping e.g. pIC50 and IC50. Args: value: The value to be transformed inverse: If inverse take the negative natural log of the value Returns: transformed_value: The transformed value. """ if not inverse: transformed_value = 10 ** (-1 * value) else: transformed_value = -1 * np.log(value) return transformed_value
[docs] def ki_to_dg( ki: unit.Quantity, uncertainty: unit.Quantity, temperature: unit.Quantity = 298.15 * unit.kelvin, ) -> tuple[unit.Quantity, unit.Quantity]: """ Convenience method to convert a Ki w/ a given uncertainty to an experimental estimate of the binding free energy. Args: ki: Experimental Ki value (e.g. 5 * unit.nanomolar) uncertainty: Experimental error. Note: returns 0 if =< 0 * unit.nanomolar. temperature: Experimental temperature. Default: 298.15 * unit.kelvin. Returns: dg: Gibbs binding free energy. ddg: Error in binding free energy. """ if ki > 1e-15 * unit.nanomolar: dg = ( unit.molar_gas_constant * temperature.to(unit.kelvin) * np.log(ki / unit.molar) ).to(unit.kilocalorie_per_mole) else: raise ValueError("negative Ki values are not supported") # propagate the uncertainty <https://en.wikipedia.org/wiki/Propagation_of_uncertainty> if uncertainty > 0 * unit.molar: ddg = ( unit.molar_gas_constant * temperature.to(unit.kelvin) * uncertainty / ki ).to(unit.kilocalorie_per_mole) else: ddg = 0 * unit.kilocalorie_per_mole return dg, ddg
[docs] def pic50_to_dg( pic50: float, uncertainty: float, temperature: unit.Quantity = 298.15 * unit.kelvin, ) -> tuple[unit.Quantity, unit.Quantity]: """ Convert the PIC50 value and its uncertainty to dg. Notes: We use a different function for this due to the slightly different error propagation formula. uncertainty calculated using: G = RTLn(Ki) Ki = 10^(-pK) sigma(G) = dG/dKi * sigma(Ki) sigma(G) = sigma(pK) * RT * d Ln(10^-pK) / dpK sigma(G) = sigma(pK) * RT * Ln(10) Args: pic50: The pIC50 value. uncertainty: The standard deviation in the pIC50 value temperature: Experimental temperature. Default: 298.15 * unit.kelvin. Returns: dg: Gibbs binding free energy. ddg: Error in binding free energy. """ ki = negative_log(pic50) * unit.molar dg = ( unit.molar_gas_constant * temperature.to(unit.kelvin) * np.log(ki / unit.molar) ).to(unit.kilocalorie_per_mole) ddg = ( unit.molar_gas_constant * temperature.to(unit.kelvin) * np.log(10) * uncertainty ).to(unit.kilocalorie_per_mole) return dg, ddg
[docs] def shift_and_add_prediction_error(df: pd.DataFrame, point_type: str) -> pd.DataFrame: """ Wrangles a cinnabar `FEMap` DataFrame and returns a DF ready for plotting. In case of absolute (`point_type`=='DG'), shifts the prediction values to the mean experimental value. Args: df: The dataframe we want to shift and add the prediction error to. point_type: Whether the points are absolute or relative. Can be "DG" or "DDG", resp. """ # shift if the mean is not Nan, this only happens when Nan is the only value in the column if point_type == "DG" and not np.isnan( exp_mean := df[f"{point_type} (kcal/mol) (EXPT)"].mean() ): # shift the FECS predicted values to the mean of the experimental values. df[f"{point_type} (kcal/mol) (FECS)"] = ( df[f"{point_type} (kcal/mol) (FECS)"] - df[f"{point_type} (kcal/mol) (FECS)"].mean() + exp_mean ) # calculate the prediction error. df["prediction error (kcal/mol)"] = abs( df[f"{point_type} (kcal/mol) (FECS)"] - df[f"{point_type} (kcal/mol) (EXPT)"] ) return df
[docs] def dG_to_pIC50(dG): """ Converts a DG kcal/mol unit to a pIC50 value. Note: uses Ki = pIC50 approximation (assumption). May need to implement a Cheng-Prusoff version. """ kT = 0.593 # kcal/mol for 298 K (25C) # NOTE: SHOULD WE BE TAKING ABS HERE? IS THIS ACCURATE? return abs( dG / np.log(10.0) / (-kT) ) # abs to prevent pIC50s from being negative in cases where 0<DG<1.
[docs] def dg_to_postera_dataframe(absolute_predictions: pd.DataFrame) -> pd.DataFrame: """ Given a wrangled `FEMap` dataframe of absolute predicted DG values (kcal/mol), replace 'kcal/mol' columns with pIC50 values and rename the columns to match what's expected by manifold. These names are defined in data.services.postera.manifold_data_tags.yaml Args: absolute_predictions: The dataframe of absolute DG predictions from asap-alchemy Returns: A copy of the dataframe with calculated pIC50 values rather than DGs ready for upload to postera. """ # use the expected column names from alchemy predict and convert them into the allowed column names defined in # data.services.postera.manifold_data_tags.yaml postera_df = absolute_predictions.copy(deep=True) for column, new_name in [ ("DG (kcal/mol) (FECS)", "FEC"), ("uncertainty (kcal/mol) (FECS)", "FEC-uncertainty"), ]: # replace the kcal/mol values with pIC50s. postera_df[column] = dG_to_pIC50(postera_df[column].values) # rename the column postera_df.rename(columns={column: f"computed-{new_name}-pIC50"}, inplace=True) # rename the label column to be clear in postera postera_df.rename(columns={"label": "Ligand_ID"}, inplace=True) return postera_df
[docs] def add_identifiers_to_df(dataframe: pd.DataFrame, ligands: list) -> pd.DataFrame: """ Given a wrangled DF containing either `label` (absolute) or `labelA` and `labelB` (relative), add molecule identifiers for each row. Args: dataframe: The pandas dataframe we should add the smiles to. ligands: The list of ligands from which we should extract the identifiers based on matching by name. Returns: The dataframe with added column(s) with molecule identifiers Notes: Currently adds the smiles and inchi key """ # get the identifiers for each ligand by name ligands_by_name = { ligand.compound_name: {"smiles": ligand.smiles, "inchi_key": ligand.inchikey} for ligand in ligands } # get the identifiers. Relative dataframe is a bit more involved than absolute. if "labelA" in dataframe.columns: smiles_a, smiles_b, inchi_key_a, inchi_key_b = [], [], [], [] for labelA, labelB in dataframe[["labelA", "labelB"]].values: smiles_a.append(ligands_by_name[labelA]["smiles"]) smiles_b.append(ligands_by_name[labelB]["smiles"]) inchi_key_a.append(ligands_by_name[labelA]["inchi_key"]) inchi_key_b.append(ligands_by_name[labelB]["inchi_key"]) dataframe["SMILES_A"] = smiles_a dataframe["SMILES_B"] = smiles_b dataframe["Inchi_Key_A"] = inchi_key_a dataframe["Inchi_Key_B"] = inchi_key_b elif "label" in dataframe.columns: smiles = [ ligands_by_name[label]["smiles"] for label in dataframe["label"].values ] inchi_key = [ ligands_by_name[label]["inchi_key"] for label in dataframe["label"].values ] dataframe["SMILES"] = smiles dataframe["Inchi_Key"] = inchi_key return dataframe
[docs] def extract_experimental_data( reference_csv: str, assay_units: Literal["pIC50", "IC50"] ) -> pd.DataFrame: """ Extract the experimental data from the given csv file, this assumes the csv has been downloaded from cdd. Where the molecule identifier is under column 'Molecule Name' and the experimental data is pIC50 / IC50 TODO make more general Args: reference_csv: The name of the csv file with the experimental data assay_units: The assay units of 'pIC50' or 'IC50' that the experimental data is given in. Returns: A pandas dataframe of the reference data with added columns containing the calculated binding affinity and its associated uncertainty converted to Gibbs free energy in kcal / mol. """ assay_tags = { "pIC50": ("pIC50_Mean", "pIC50_Mean Standard Deviation (±)"), "IC50": ("IC50_GMean (µM)", "IC50_GMean (µM) Standard Deviation (×/÷)"), } exp_data = pd.read_csv(reference_csv).fillna(0) # work out the columns for the ref data and the uncertainty assay_endpoint_tag, assay_endpoint_confidence_tag = None, None for col in exp_data.columns: if col.endswith(assay_tags[assay_units][0]): assay_endpoint_tag = col elif col.endswith(assay_tags[assay_units][1]): assay_endpoint_confidence_tag = col if assay_endpoint_tag is None: raise RuntimeError( f"Could not determine the assay tag from the provided units {assay_units}." ) if assay_endpoint_confidence_tag is None: warnings.warn( f"Failed to detect Standard Deviation in experimental reference file {reference_csv}." ) if assay_units == "pIC50": converter = pic50_to_dg units = unit.dimensionless # CDD IC50 uses micromolar units else: converter = ki_to_dg units = unit.micromolar experimental_affinity, experimental_affinity_error = [], [] # add the calculated affinity and error to the dataframe for _, row in exp_data.iterrows(): exp_value = row[assay_endpoint_tag] if assay_endpoint_confidence_tag: uncertainty = row[assay_endpoint_confidence_tag] else: uncertainty = 0 dg, ddg = converter(exp_value * units, uncertainty * units) experimental_affinity.append(dg.m) experimental_affinity_error.append(ddg.m) # the column names here match those made by `parse_fluorescence_data_cdd` exp_data["exp_binding_affinity_kcal_mol"] = experimental_affinity exp_data["exp_binding_affinity_kcal_mol_stderr"] = experimental_affinity_error return exp_data
def _find_ligand_data( name: str, inchi_key: str, experimental_data: pd.DataFrame ) -> dict: """ Multi search method which tries to match the name then the inchi key when looking for a molecules experimental data. Notes: Columns should have names `Molecule Name` and `Inchi Key`, if the molecule can not be found dummy data is returned. We use InchiKey when matching as this is atom order and cheminformatics toolkit independent. Args: name: The name which should be used to match the molecules inchi_key: The inchi key which should be used to match the molecules experimental_data: The experimental dataframe which should be searched for the target molecule Returns: A dictionary of the data from the dataframe which matches the provided name or inchi key """ ligand_data = experimental_data[experimental_data["Molecule Name"] == name] if len(ligand_data) < 1: ligand_data = experimental_data[experimental_data["Inchi Key"] == inchi_key] if len(ligand_data) == 1: ligand_data = ligand_data.iloc[0] else: # dummy data if not found easy to drop later ligand_data = { "exp_binding_affinity_kcal_mol": np.nan, "exp_binding_affinity_kcal_mol_stderr": np.nan, } return ligand_data
[docs] def add_absolute_expt( dataframe: pd.DataFrame, experimental_data: pd.DataFrame, ): """ Edit the dataframe inplace by adding experimental data provided to it. Args: dataframe: The dataframe of absolute free energy predictions to add the experimental data to, this should contain the name, smiles and inchi key of the molecules. experimental_data: A dataframe containing the experimental free energies in units of kcal/mol to add to the dataframe. """ experimental_col, uncertainty_col = [], [] for _, row in dataframe.iterrows(): # use the two stage lookup ligand_data = _find_ligand_data( name=row["label"], inchi_key=row["Inchi_Key"], experimental_data=experimental_data, ) experimental_col.append(ligand_data["exp_binding_affinity_kcal_mol"]) uncertainty_col.append(ligand_data["exp_binding_affinity_kcal_mol_stderr"]) dataframe["DG (kcal/mol) (EXPT)"] = experimental_col dataframe["uncertainty (kcal/mol) (EXPT)"] = uncertainty_col
[docs] def add_relative_expt( dataframe: pd.DataFrame, experimental_data: pd.DataFrame, ): """ Edit the relative dataframe in place by adding experimental data provided to it. Args: dataframe: The dataframe of relative free energy predictions to add the experimental data to. experimental_data: A dictionary of experimental free energies in units of kcal/mol to add to the dataframe. """ experimental_col, uncertainty_col = [], [] for _, row in dataframe.iterrows(): ligand_a_data = _find_ligand_data( name=row["labelA"], inchi_key=row["Inchi_Key_A"], experimental_data=experimental_data, ) ligand_b_data = _find_ligand_data( name=row["labelB"], inchi_key=row["Inchi_Key_B"], experimental_data=experimental_data, ) # compute experimental DDG for this edge ddg = ( ligand_b_data["exp_binding_affinity_kcal_mol"] - ligand_a_data["exp_binding_affinity_kcal_mol"] ) # take the average uncertainty between measurements for this edge. delta_ddg = np.mean( [ ligand_a_data["exp_binding_affinity_kcal_mol_stderr"], ligand_b_data["exp_binding_affinity_kcal_mol_stderr"], ] ) experimental_col.append(ddg) uncertainty_col.append(delta_ddg) dataframe["DDG (kcal/mol) (EXPT)"] = experimental_col dataframe["uncertainty (kcal/mol) (EXPT)"] = uncertainty_col
[docs] def get_data_from_femap( fe_map: cinnabar.FEMap, ligands: list, assay_units: Optional[str] = None, reference_dataset: Optional[str] = None, cdd_protocol: Optional[str] = None, ) -> tuple[pd.DataFrame, pd.DataFrame]: """ Given a `cinnabar` `FEMap` add the experimental reference data and generate and return: 1. a Pandas DataFrame that has all absolute predictions and measurements (DG in kcal/mol) and (pIC50) 2. a pd DF that has all relative predictions and measurements (DDG in kcal/mol) and PIC50 Args: fe_map: The cinnabar FEMap which has all calculated edges present and the absolute estimates. ligands: The list of asap ligands which are part of the network. assay_units: The units of the experimental data, which should be extracted from the reference dataset. reference_dataset: The name of the cdd csv file which contains the experimental data. cdd_protocol: The name of the CDD protocol from which we should extract experimental data. Returns: An absolute and relative free energy prediction dataframe. """ # extract the dataframes from cinnabar and format absolute_df = ( fe_map.get_absolute_dataframe() .rename( columns={ "DG (kcal/mol)": "DG (kcal/mol) (FECS)", "uncertainty (kcal/mol)": "uncertainty (kcal/mol) (FECS)", } ) .drop(["source", "computational"], axis=1) ) relative_df = ( fe_map.get_relative_dataframe() .rename( columns={ "DDG (kcal/mol)": "DDG (kcal/mol) (FECS)", "uncertainty (kcal/mol)": "uncertainty (kcal/mol) (FECS)", } ) .drop(["source", "computational"], axis=1) ) # add identifiers to each dataframe these help with matching if names fail for df in [absolute_df, relative_df]: add_identifiers_to_df(df, ligands) # add experimental data if available experimental_data = None if reference_dataset is not None: experimental_data = extract_experimental_data( reference_csv=reference_dataset, assay_units=assay_units ) elif cdd_protocol: experimental_data = download_cdd_data(protocol_name=cdd_protocol) if experimental_data is not None: add_absolute_expt(dataframe=absolute_df, experimental_data=experimental_data) add_relative_expt(dataframe=relative_df, experimental_data=experimental_data) absolute_df = shift_and_add_prediction_error(df=absolute_df, point_type="DG") relative_df = shift_and_add_prediction_error(df=relative_df, point_type="DDG") # now also add the calculated pCI50 absolute dataframe which is used . # absolute_df = dg_to_pic50_dataframe(absolute_df) # relative_df_wrangled_pic50 = dg_to_pic50_dataframe(relative_df_wrangled) return absolute_df, relative_df
[docs] def draw_mol(smiles: str) -> str: """ Create SVG text of a 2D depiction of a molecule which can be embed in an html report. Args: smiles: The smiles of the molecule which should be drawn. Returns: The SVG text of the drawing. Notes: This function will draw multi molecules side by side. """ rdkit_mol = Chem.RemoveHs(Chem.MolFromSmiles(smiles)) rdkit_mol = Draw.PrepareMolForDrawing(rdkit_mol, forceCoords=True) mols = Chem.GetMolFrags(rdkit_mol, asMols=True) if len(mols) == 2: drawer = Draw.rdMolDraw2D.MolDraw2DSVG(400, 200, 200, 200) drawer.DrawMolecules(mols) else: drawer = Draw.rdMolDraw2D.MolDraw2DSVG(400, 200) drawer.DrawMolecule(rdkit_mol) drawer.FinishDrawing() data = base64.b64encode(drawer.GetDrawingText().encode()).decode() return f'<img src="data:image/svg+xml;base64,{data}"></img>'
[docs] def plotmol_absolute( calculated: np.array, experimental: np.array, smiles: list[str], titles: Optional[list[str]] = None, calculated_uncertainty: Optional[np.array] = None, experimental_uncertainty: Optional[np.array] = None, ) -> bokeh.plotting.figure: """ Create an interactive plot using Plotmol for the absolute predictions of the free energies. Args: calculated: An array of calculated predictions. experimental: An array of experimental predictions in order of the calculated values. smiles: A list of smiles strings in order of the calculated values. titles: A list of titles in order of the calculated values. calculated_uncertainty: An optional array of calculated uncertainty values. experimental_uncertainty: An optional array of experimental uncertainty values. Returns: The bokeh interactive plot. """ # set up our own tooltip to show the title and other info custom_tooltip_template = """ <div> <div> <span>@title</span> <span><br>experimental pIC50: @experimental<br>calculated pIC50: @prediction</span> <img src="@image" ></img> </div> </div> """ figure = bokeh.plotting.figure( tooltips=custom_tooltip_template, title="Predicted affinity", x_axis_label="Experimental pIC50", y_axis_label="Calculated pIC50", width=800, height=800, ) # create the tooltip data needed for this plot type tooltip_data = { "title": titles, "experimental": experimental, "prediction": calculated, } return _plot_with_plotmol( figure=figure, calculated=calculated, experimental=experimental, smiles=smiles, calculated_uncertainty=calculated_uncertainty, experimental_uncertainty=experimental_uncertainty, custom_column_data=tooltip_data, )
def _plot_with_plotmol( figure: bokeh.plotting.figure, calculated: np.array, experimental: np.array, smiles: list[str], custom_column_data: dict[str, list], calculated_uncertainty: Optional[np.array] = None, experimental_uncertainty: Optional[np.array] = None, ) -> bokeh.plotting.figure: # set up the x and y range and plot x=y line axis_padding = 0.5 ax_min = min(min(experimental), min(calculated)) - axis_padding ax_max = max(max(experimental), max(calculated)) + axis_padding scale = Range1d(ax_min, ax_max) figure.x_range = scale figure.y_range = scale figure.line( [ax_min, ax_max], [ax_min, ax_max], line_color="black", line_dash="dashed" ) band_data = ColumnDataSource( { "x": [ax_min, ax_max], "upper_in": [ax_min + 0.5, ax_max + 0.5], "upper_out": [ax_min + 1, ax_max + 1], "lower_in": [ax_min - 0.5, ax_max - 0.5], "lower_out": [ax_min - 1, ax_max - 1], } ) in_band = Band( base="x", upper="upper_in", lower="lower_in", source=band_data, fill_alpha=0.2, fill_color="grey", ) outer_band = Band( base="x", upper="upper_out", lower="lower_out", source=band_data, fill_alpha=0.2, fill_color="grey", ) palette = bokeh.palettes.Spectral4 figure.add_layout(in_band) figure.add_layout(outer_band) # add error bars if provided if calculated_uncertainty is not None: calc_error_data = ColumnDataSource( { "x": experimental, "upper": calculated + calculated_uncertainty, "lower": calculated - calculated_uncertainty, } ) calc_error = Whisker( base="x", upper="upper", lower="lower", source=calc_error_data ) figure.add_layout(calc_error) if experimental_uncertainty is not None: exp_error_data = ColumnDataSource( { "x": calculated, "upper": experimental + experimental_uncertainty, "lower": experimental - experimental_uncertainty, } ) exp_error = Whisker( base="x", upper="upper", lower="lower", source=exp_error_data, dimension="width", ) figure.add_layout(exp_error) plotmol.scatter( figure=figure, x=experimental, y=calculated, smiles=smiles, marker="o", marker_size=15, marker_color=palette[0], custom_column_data=custom_column_data, ) figure.axis.axis_label_text_font_size = "20pt" return figure
[docs] def plotmol_relative( calculated: np.array, experimental: np.array, smiles: list[str], titles: Optional[list[tuple[str, str]]] = None, calculated_uncertainty: Optional[np.array] = None, experimental_uncertainty: Optional[np.array] = None, ) -> bokeh.plotting.figure: """ Create an interactive plot using Plotmol for the relative predictions of the free energies. Args: calculated: An array of calculated predictions. experimental: An array of experimental predictions in order of the calculated values. smiles: A list of smiles strings in order of the calculated values. titles: A list of titles in order of the calculated values. calculated_uncertainty: An optional array of calculated uncertainty values. experimental_uncertainty: An optional array of experimental uncertainty values. Returns: The bokeh interactive plot. """ # set up our own tooltip to show the title and other info custom_tooltip_template = """ <div> <div> <span>StateA:@titleA → StateB:@titleB</span> <span><br>experimental DpIC50: @experimental<br>calculated DpIC50: @prediction</span> <img src="@image" ></img> </div> </div> """ figure = bokeh.plotting.figure( tooltips=custom_tooltip_template, title="Relative prediction", x_axis_label="Experimental ΔpIC50", y_axis_label="Calculated ΔpIC50", width=800, height=800, ) titles_a, titles_b = [], [] for title_a, title_b in titles: titles_a.append(title_a) titles_b.append(title_b) # create the tooltip data needed for this plot type tooltip_data = { "titleA": titles_a, "titleB": titles_b, "experimental": experimental, "prediction": calculated, } return _plot_with_plotmol( figure=figure, calculated=calculated, experimental=experimental, smiles=smiles, calculated_uncertainty=calculated_uncertainty, experimental_uncertainty=experimental_uncertainty, custom_column_data=tooltip_data, )
[docs] def add_pic50_columns(dataframe: pd.DataFrame): """Adds pIC50 columns for all DG columns.""" pd.options.mode.chained_assignment = None # turn off annoying + useless warning for col, new_col in { "DG (kcal/mol) (FECS)": "pIC50 (FECS)", "uncertainty (kcal/mol) (FECS)": "uncertainty (pIC50) (FECS)", "DG (kcal/mol) (EXPT)": "pIC50 (EXPT)", "uncertainty (kcal/mol) (EXPT)": "uncertainty (pIC50) (EXPT)", "prediction error (kcal/mol)": "prediction error (pIC50)", "DDG (kcal/mol) (FECS)": "DpIC50 (FECS)", "DDG (kcal/mol) (EXPT)": "DpIC50 (EXPT)", }.items(): if col in dataframe.columns: dataframe[new_col] = dG_to_pIC50(dataframe[col])
[docs] def create_absolute_report(dataframe: pd.DataFrame) -> panel.Column: """ Create a cinnabar style interactive report for the dataframe of absolute free energy predictions, this dataframe should include experimental data already. Args: dataframe: The dataframe of absolute predictions to construct the report for. Returns: A panel column containing an interactive plot and table of the free energy predictions. Notes: Only plots molecules with experimental values, if no molecules have exp values only the table is created. """ number_format = bokeh.models.widgets.tables.NumberFormatter(format="0.0000") # add drawn molecule as a column mols = [draw_mol(smiles) for smiles in dataframe["SMILES"]] dataframe["Molecule"] = mols # create a plotting dataframe which drops rows with nans plotting_df = dataframe.dropna(axis=0, inplace=False) plotting_df.reset_index(inplace=True) # only make the plot if we have exp data and more than one point if len(plotting_df) > 3 and "DG (kcal/mol) (EXPT)" in plotting_df.columns: # add pIC50 columns beside DG add_pic50_columns(plotting_df) # create the DG plot fig = plotmol_absolute( calculated=plotting_df["pIC50 (FECS)"], experimental=plotting_df["pIC50 (EXPT)"], smiles=plotting_df["SMILES"], titles=plotting_df["label"], calculated_uncertainty=plotting_df["uncertainty (pIC50) (FECS)"], experimental_uncertainty=plotting_df["uncertainty (pIC50) (EXPT)"], ) # calculate the bootstrapped stats using cinnabar stats_data = [] for statistic in ["RMSE", "MUE", "R2", "rho"]: s = stats.bootstrap_statistic( plotting_df["pIC50 (EXPT)"], plotting_df["pIC50 (FECS)"], plotting_df["uncertainty (pIC50) (EXPT)"], plotting_df["uncertainty (pIC50) (FECS)"], statistic=statistic, include_true_uncertainty=False, include_pred_uncertainty=False, ) stats_data.append( { "Statistic": statistic, "value": s["mle"], "lower bound": s["low"], "upper bound": s["high"], } ) stats_df = pd.DataFrame(stats_data) # create a format for numerical data in the tables stats_format = {col: number_format for col in stats_df.columns} stats_format["Statistic"] = "html" else: stats_df, fig = None, None # construct the report layout = panel.Column(sizing_mode="stretch_width", scroll=True) if stats_df is not None and fig is not None: row_layout = panel.Row( panel.pane.Bokeh(fig), ( panel.widgets.Tabulator( stats_df, show_index=False, selectable=False, disabled=True, formatters=stats_format, configuration={"columnDefaults": {"headerSort": False}}, ) ), ) layout.append(row_layout) # add the table layout.append( panel.widgets.Tabulator( # use full data frame including nans for table dataframe, show_index=False, disabled=True, formatters={ "SMILES": "html", "Molecule": "html", "pIC50 (FECS)": number_format, "uncertainty (pIC50) (FECS)": number_format, "pIC50 (EXPT)": number_format, "uncertainty (pIC50) (EXPT)": number_format, "prediction error (pIC50)": number_format, }, configuration={"rowHeight": 300}, ) ) return layout
[docs] def create_relative_report(dataframe: pd.DataFrame) -> panel.Column: """ Create a cinnabar style interactive report for the dataframe of relative free energy predictions, this dataframe should include experimental data already. Args: dataframe: The dataframe of relative predictions to construct the report for. Returns: A panel column containing an interactive plot and table of the free energy predictions if we have 2 or more exp data points else only a table is created. """ mols, combined_smiles, titles = [], [], [] for _, data in dataframe.iterrows(): smiles = ".".join([data["SMILES_A"], data["SMILES_B"]]) combined_smiles.append(smiles) mols.append(draw_mol(smiles=smiles)) titles.append((data["labelA"], data["labelB"])) dataframe["Molecules"] = mols dataframe["labels"] = titles dataframe["smiles"] = combined_smiles # create a plotting dataframe which drops rows with nans plotting_df = dataframe.dropna(axis=0, inplace=False) plotting_df.reset_index(inplace=True) # add pIC50 columns beside DG add_pic50_columns(plotting_df) number_format = bokeh.models.widgets.tables.NumberFormatter(format="0.0000") # only plot the graph if we have exp data and more than a single point make_plots_stats = ( len(plotting_df) > 3 and "DDG (kcal/mol) (EXPT)" in plotting_df.columns ) if make_plots_stats: # create the DDG plot fig = plotmol_relative( calculated=plotting_df["DpIC50 (FECS)"], experimental=plotting_df["DpIC50 (EXPT)"], smiles=plotting_df["smiles"], titles=plotting_df["labels"], calculated_uncertainty=plotting_df["uncertainty (pIC50) (FECS)"], experimental_uncertainty=plotting_df["uncertainty (pIC50) (EXPT)"], ) # calculate the bootstrapped stats using cinnabar stats_data = [] for statistic in ["RMSE", "MUE", "R2", "rho"]: s = stats.bootstrap_statistic( plotting_df["DpIC50 (EXPT)"], plotting_df["DpIC50 (FECS)"], plotting_df["uncertainty (pIC50) (EXPT)"], plotting_df["uncertainty (pIC50) (FECS)"], statistic=statistic, include_true_uncertainty=False, include_pred_uncertainty=False, ) stats_data.append( { "Statistic": statistic, "value": s["mle"], "lower bound": s["low"], "upper bound": s["high"], } ) stats_df = pd.DataFrame(stats_data) # create a format for numerical data in the tables stats_format = {col: number_format for col in stats_df.columns} stats_format["Statistic"] = "html" # construct the report layout = panel.Column( panel.Row( panel.pane.Bokeh(fig) if make_plots_stats else None, ( panel.widgets.Tabulator( stats_df, show_index=False, selectable=False, disabled=True, formatters=stats_format, configuration={"columnDefaults": {"headerSort": False}}, ) if make_plots_stats else None ), ), panel.widgets.Tabulator( dataframe.drop(columns=["labels", "smiles"]), show_index=False, disabled=True, formatters={ "SMILES_A": "html", "SMILES_B": "html", "Molecules": "html", "DpIC50 (FECS)": number_format, "uncertainty (pIC50) (FECS)": number_format, "DpIC50 (EXPT)": number_format, "uncertainty (pIC50) (EXPT)": number_format, "prediction error (pIC50)": number_format, }, configuration={"rowHeight": 300}, ), sizing_mode="stretch_width", scroll=True, ) return layout
[docs] def download_cdd_data(protocol_name: str) -> pd.DataFrame: """ A wrapper method to download CDD protocol data, mainly used to tuck imports. Args: protocol_name: The name of the CDD protocol to extract experimental data for. Returns: A dataframe of the extracted and formatted experimental data. """ from drugforge.data.services.cdd.cdd_api import CDDAPI from drugforge.data.services.services_config import CDDSettings from drugforge.data.util.utils import parse_fluorescence_data_cdd settings = CDDSettings() cdd_api = CDDAPI.from_settings(settings=settings) ic50_data = cdd_api.get_ic50_data(protocol_name=protocol_name) # format the data to add the pIC50 and error formatted_data = parse_fluorescence_data_cdd( mol_df=ic50_data, assay_name=protocol_name ) return formatted_data
[docs] def clean_result_network(network, console=None, ddg_outlier_threshold=15): """ Cleans an incoming result network JSON file from some issues that might occur. Current procedures: - removes edges that have DG==0.0, this happens when there is inconsistent stereo annotation in input ligands such that after stereo enumeration there are duplicate ligands. - cleans imbalanced complex/solvent legs, e.g. when some have failed or when stereo expansion was done unintentionally. Averages duplicate legs to come to a single value. - removes edges that have erroneously large DDGs (anything absolute above ddg_outlier_threshold in kcal/mol) returns the loaded FreeEnergyCalculationNetwork. """ import math from collections import defaultdict import numpy as np from drugforge.alchemy.schema.fec import ( AlchemiscaleResults, FreeEnergyCalculationNetwork, TransformationResult, ) from rich.padding import Padding # load in to schema and extract the results network_schema = FreeEnergyCalculationNetwork.from_file(network) input_results = network_schema.results.results # 1. remove edges where DG is 0.0 cleaned_results = [ result for result in input_results if not result.estimate.magnitude == 0.0 ] num_0_0_removed = len(input_results) - len(cleaned_results) # 2. balance between complex/solvent replicates, such that n=N=1 deduped_results_dict = defaultdict(list) for result in cleaned_results: transform = f"{result.ligand_a}~{result.ligand_b}_{result.phase}" deduped_results_dict[transform].append(result) deduped_results = [] for _, results in deduped_results_dict.items(): if len(results) > 1: # take the arithmetic mean of DG and dDG and add the replaced first result, # all provenance data is constant between these repeats anyway mean_DG = ( np.mean([result.estimate.magnitude for result in results]) * unit.kilocalorie_per_mole ) # FIXME: Uncertainty of the mean is not mean of uncertainties. This is probably incorrect. mean_dDG = ( np.mean([result.uncertainty.magnitude for result in results]) * unit.kilocalorie_per_mole ) result_data = results[0].model_dump(exclude={"estimate", "uncertainty"}) tf_res = TransformationResult( estimate=mean_DG, uncertainty=mean_dDG, **result_data ) else: tf_res = results[0] deduped_results.append(tf_res) num_dupes_removed = len(cleaned_results) - len(deduped_results) # remove predictions that have a NaN as either prediction or unc - this is extremely rare denand_results = [] for result in deduped_results: if not math.isnan(result.estimate.magnitude) and not math.isnan( result.uncertainty.magnitude ): denand_results.append(result) # remove edges that have erroneously high DDG values results_complex = [] results_solvent = [] for edge_result in denand_results: if edge_result.phase == "complex": results_complex.append(edge_result) elif edge_result.phase == "solvent": results_solvent.append(edge_result) else: raise ValueError( f"Edge phase {edge_result.phase} not recognized for edge {edge_result}" ) results_not_overly_large = [] large_edge_removal_counter = 0 for res_complex in results_complex: # find the solvent match for res_solvent in results_solvent: if ( res_complex.ligand_a == res_solvent.ligand_a and res_complex.ligand_b == res_solvent.ligand_b ): # only add back in if the edge's DDG is below the outlier threshold if ( abs(res_solvent.estimate.magnitude - res_complex.estimate.magnitude) < ddg_outlier_threshold ): results_not_overly_large.append(res_solvent) results_not_overly_large.append(res_complex) else: large_edge_removal_counter += 1 # done! let's repack everything. if console: message = Padding( f"Cleaned incoming result network:\n- Removed {num_0_0_removed} edge(s) with DG==0.0 kcal/mol\n- Removed {num_dupes_removed} edge(s) to balance between complex/solvent replicates.\n- Removed {len(deduped_results)-len(denand_results)} edge(s) that contained a NaN measurement\n- Removed {large_edge_removal_counter} edges that have an abs(DDG) of more than {ddg_outlier_threshold} kcal/mol", (1, 0, 1, 0), ) console.print(message) data = network_schema.model_dump(exclude={"results"}) # unpack the deduped results into dicts results = AlchemiscaleResults( results=results_not_overly_large, network_key=network_schema.results.network_key ).model_dump() data["results"] = results fec = FreeEnergyCalculationNetwork.model_validate(data) return fec
[docs] def get_top_n_poses( absolute_df, ligands, top_n, console=False, write_file=True ) -> list[Ligand]: """ Takes the `top_n` number of ligands from the FE predictions and creates a list of `Ligand` objects. If specified, will write a multi-SDF file of those ligands into the local directory while logging this. """ from drugforge.data.schema.ligand import write_ligands_to_multi_sdf from rich.padding import Padding # get a dict of ligands so we can more easily grab them by name ligands_dict = {ligand.compound_name: ligand for ligand in ligands} # write out each compound of the top n to the multi-SDF top_n_ligands = [] if top_n > len(absolute_df): # cap the slice to the max number of predictions top_n = len(absolute_df) docked_hits_path = f"top_{top_n}_posed_ligands.sdf" for compound_name in absolute_df.sort_values(by="DG (kcal/mol) (FECS)")["label"][ :top_n ]: top_n_ligands.append(ligands_dict[compound_name]) if write_file: write_ligands_to_multi_sdf(docked_hits_path, top_n_ligands, overwrite=True) if console: message = Padding( f"Top {top_n} compound poses written to [repr.filename]{docked_hits_path}[/repr.filename]", (1, 0, 1, 0), ) console.print(message) return top_n_ligands