"""Gradio app that runs ``submission`` on SMILES and tabulates predicted IC50."""

import logging
import pathlib
from typing import List, Optional, Tuple

import gradio as gr
import pandas as pd
from rdkit import Chem
from tqdm import tqdm

from configuration import GENE_EXPRESSION_METADATA
from submission import submission

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Short display names for a few verbose tissue-site labels; any site not
# listed here is shown unchanged.
site_mapper = {
    "central_nervous_system": "CNS",
    "haematopoietic_and_lymphoid_tissue": "Haema_lymph",
    "upper_aerodigestive_tract": "digestive",
    "autonomic_ganglia": "ganglia",
}

# Columns of the bundled metadata that are not shown in the result table.
_HIDDEN_METADATA_COLUMNS = [
    "histology",
    "cell_line_name",
    "IC50 (min/max scaled)",
    "IC50 (log(μmol))",
]


def _load_smiles(smiles: Optional[str], smiles_path: Optional[str]) -> List[str]:
    """Return the validated SMILES to predict on.

    A directly passed SMILES takes precedence over the file. The file is read
    as a header-less, tab-separated table whose first column holds the SMILES.

    Raises:
        TypeError: If ``smiles_path`` is neither ``None`` nor ``str``, or if
            neither a SMILES nor a file was supplied.
        ValueError: If RDKit cannot parse one of the SMILES.
    """
    if not isinstance(smiles_path, (str, type(None))):
        raise TypeError(
            f"SMILES file pass has to be None or str, not {type(smiles_path)}"
        )

    # NOTE(review): an untouched text box presumably arrives as "" rather than
    # None; treat blank input as "not given" so an uploaded file is not
    # shadowed by an empty string.
    if isinstance(smiles, str) and not smiles.strip():
        smiles = None

    if smiles is None and smiles_path is None:
        raise TypeError("Pass either single SMILES or a file")

    if smiles is not None:
        candidates = [smiles]
    else:
        # header=None (not False): pandas rejects a boolean ``header``.
        smiles_data = pd.read_csv(smiles_path, sep="\t", header=None)
        candidates = smiles_data[0].tolist()

    for smi in candidates:
        if Chem.MolFromSmiles(smi) is None:
            raise ValueError(f"Found invalid SMILES {smi}")
    return candidates


def _prepare_metadata() -> pd.DataFrame:
    """Return a display copy of the bundled cell-line metadata.

    ``drop`` returns a new frame, so the shared ``GENE_EXPRESSION_METADATA``
    object is left untouched and repeated calls keep working.
    """
    df = GENE_EXPRESSION_METADATA.drop(columns=_HIDDEN_METADATA_COLUMNS)
    logger.debug("Metadata columns: %s", list(df.columns))
    df["site"] = df["site"].apply(lambda x: site_mapper.get(x, x))
    # Keep only the part of the cell-line identifier before the first "_".
    df["cell_line"] = df["cell_line"].apply(lambda x: x.split("_")[0])
    return df


def run_inference(
    smiles: Optional[str],
    smiles_path: Optional[str],
    omic_path: Optional[str],
    confidence: bool,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Predict log-micromolar IC50 for one SMILES or a file of SMILES.

    Args:
        smiles: A single SMILES string; takes precedence over ``smiles_path``.
        smiles_path: Path to a header-less, tab-separated file with SMILES in
            the first column.
        omic_path: Optional path to an omics file forwarded to ``submission``
            as ``omics_file``. With ``None``, the bundled
            ``GENE_EXPRESSION_METADATA`` is joined to the predictions.
        confidence: Whether to request and report confidence estimates.

    Returns:
        The same result frame twice, one per Gradio output component.
        NOTE(review): the second output is wired to ``gr.File``, which
        presumably expects a file path rather than a DataFrame -- confirm.

    Raises:
        TypeError: On wrongly typed paths or when no SMILES input is given.
        ValueError: If a SMILES cannot be parsed by RDKit.
    """
    smiles_list = _load_smiles(smiles, smiles_path)

    if not isinstance(omic_path, (str, type(None))):
        raise TypeError(f"Omics file pass has to be None or str, not {type(omic_path)}")

    results = {}
    for smi in tqdm(smiles_list, total=len(smiles_list)):
        result = submission(
            drug={"smiles": smi},
            workspace_id="emulated_workspace_id",
            task_id="emulated_task_id",
            estimate_confidence=confidence,
            omics_file=omic_path,
        )
        # Only the log-micromolar IC50 (and, on request, the confidences) are
        # reported; other entries of ``result`` such as the attention values
        # are ignored for the moment.
        results[f"IC50_{smi}"] = result["log_micromolar_IC50"].squeeze().round(3)
        if confidence:
            results[f"aleatoric_confidence_{smi}"] = (
                result["aleatoric_confidence"].squeeze().round(3)
            )
            # NOTE(review): this column used to be filled from
            # "aleatoric_confidence" (copy-paste slip); assumes ``submission``
            # returns an "epistemic_confidence" entry -- confirm.
            results[f"epistemic_confidence_{smi}"] = (
                result["epistemic_confidence"].squeeze().round(3)
            )

    logger.debug("Prediction results: %s", results)
    predicted_df = pd.DataFrame(results)

    if omic_path is None:
        df = _prepare_metadata()
        # Cell line first, then the predictions, then the remaining metadata.
        result_df = pd.concat(
            [df["cell_line"], predicted_df, df.drop(["cell_line"], axis=1)], axis=1
        )
    else:
        # No bundled metadata describes user-supplied omics, so only the
        # predictions are returned (previously this path hit a NameError).
        result_df = predicted_df

    return result_df, result_df


if __name__ == "__main__":
    # Load metadata
    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

    # Examples are loaded but currently not passed to the interface.
    examples = pd.read_csv(
        metadata_root.joinpath("examples.csv"), header=None, sep="|"
    ).fillna("")

    article = metadata_root.joinpath("article.md").read_text(encoding="utf-8")
    description = metadata_root.joinpath("description.md").read_text(encoding="utf-8")

    demo = gr.Interface(
        fn=run_inference,
        title="PaccMann",
        inputs=[
            gr.Textbox(
                label="SMILES",
                placeholder="COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O",
                lines=1,
            ),
            gr.File(
                file_types=[".smi", ".tsv"],
                label="List of SMILES (tab-separated file with SMILES in first column)",
            ),
            gr.File(
                file_types=[".csv"],
                label="Transcriptomics data with cell lines in rows and genes in columns",
            ),
            gr.Radio(choices=[True, False], label="Estimate confidence", value=False),
        ],
        outputs=[gr.DataFrame(label="Output"), gr.File()],
        article=article,
        description=description,
        # examples=examples.values.tolist(),
    )
    demo.launch(debug=True, show_error=True)