Spaces:
Running
Running
import logging | |
import os | |
import pathlib | |
import tempfile | |
from typing import List, Optional | |
import gradio as gr | |
import pandas as pd | |
from rdkit import Chem | |
from tqdm import tqdm | |
from configuration import GENE_EXPRESSION_METADATA | |
from submission import submission | |
# Module logger; the NullHandler keeps the module silent unless the host
# application configures logging itself.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Short display labels for the longest tissue-site names; sites that are not
# listed here are shown unchanged (see the `.get(x, x)` lookup at the use site).
site_mapper = dict(
    central_nervous_system="CNS",
    haematopoietic_and_lymphoid_tissue="Haema_lymph",
    upper_aerodigestive_tract="digestive",
    autonomic_ganglia="ganglia",
)
def run_inference(
    smiles: Optional[str],
    smiles_path: Optional[str],
    omic: Optional[str],
    confidence: bool,
):
    """Run the PaccMann IC50 prediction for one or several molecules.

    Exactly one of ``smiles`` and ``smiles_path`` must be provided.

    Args:
        smiles: A single SMILES string. ``""`` or ``None`` means "not given".
        smiles_path: Uploaded file object whose ``.name`` attribute is the path
            of a tab-separated, header-less file with SMILES in the first
            column. ``None`` means "not given".
        omic: Optional uploaded file object whose ``.name`` is passed to
            ``submission`` as ``omics_file``. When ``None``, ``submission`` is
            called with ``omics_file=None`` and the predictions are joined
            with ``GENE_EXPRESSION_METADATA`` for display.
        confidence: Whether to request confidence estimates and add the
            aleatoric/epistemic confidence columns to the result.

    Returns:
        A tuple ``(csv_path, preview)`` where ``csv_path`` is the path of the
        CSV file holding the full result table and ``preview`` is its first
        25 rows.

    Raises:
        TypeError: If neither or both of ``smiles`` / ``smiles_path`` are given.
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    # Treat None like an empty textbox so a missing value is reported as a
    # usage error instead of being forwarded to RDKit as `None`.
    has_smiles = smiles is not None and smiles != ""
    has_file = smiles_path is not None

    if not has_smiles and not has_file:
        raise TypeError("Pass either single SMILES or a file")
    if has_smiles and has_file:
        raise TypeError("Pass either single SMILES or a file, not both")

    if has_smiles:
        smiles_list = [smiles]
    else:
        smiles_data = pd.read_csv(smiles_path.name, sep="\t", header=None)
        smiles_list = smiles_data[0].tolist()

    # Validate everything up front so no model call is wasted on bad input.
    for smi in smiles_list:
        if Chem.MolFromSmiles(smi) is None:
            raise ValueError(f"Found invalid SMILES {smi}")

    # Use the uploaded omics file if there is one, otherwise the baseline.
    omic_path = omic.name if omic is not None else None

    predicted_df = pd.DataFrame({})
    for smi in tqdm(smiles_list, total=len(smiles_list)):
        output = submission(
            drug={"smiles": smi},
            workspace_id="emulated_workspace_id",
            task_id="emulated_task_id",
            estimate_confidence=confidence,
            omics_file=omic_path,
        )
        # For the moment no attention analysis
        output.pop("gene_attention")
        output.pop("smiles_attention", None)
        output.pop("IC50")

        predicted_df[f"IC50_{smi}"] = (
            output["log_micromolar_IC50"].squeeze().round(3)
        )
        if confidence:
            predicted_df[f"aleatoric_confidence_{smi}"] = (
                output["aleatoric_confidence"].squeeze().round(3)
            )
            # Fixed: this column previously repeated the aleatoric values.
            # NOTE(review): assumes `submission` returns an
            # "epistemic_confidence" entry when estimate_confidence is True.
            predicted_df[f"epistemic_confidence_{smi}"] = (
                output["epistemic_confidence"].squeeze().round(3)
            )

    # Prepare DF to visualize
    if omic_path is None:
        # `drop` returns a new frame, so the shared metadata is not mutated.
        df = GENE_EXPRESSION_METADATA.drop(
            columns=[
                "histology",
                "cell_line_name",
                "IC50 (min/max scaled)",
                "IC50 (log(μmol))",
            ]
        )
        df["site"] = df["site"].apply(lambda x: site_mapper.get(x, x))
        df["cell_line"] = df["cell_line"].apply(lambda x: x.split("_")[0])
        if not confidence:
            # Hide any confidence columns shipped with the metadata when the
            # user did not ask for confidence estimates.
            df = df.drop(
                columns=["aleatoric_confidence", "epistemic_confidence"],
                errors="ignore",
            )
        # The prediction columns are always suffixed with the SMILES, so there
        # are no un-suffixed confidence columns to remove from `predicted_df`.
        result_df = pd.concat(
            [df["cell_line"], predicted_df, df.drop(["cell_line"], axis=1)],
            axis=1,
        )
    else:
        result_df = predicted_df

    # Save to temporary dir
    # NOTE(review): the file name is fixed, so concurrent requests write to
    # the same path.
    temp_path = os.path.join(tempfile.gettempdir(), "paccmann_result.csv")
    result_df.to_csv(temp_path)
    return temp_path, result_df.head(25)
if __name__ == "__main__":
    # The markdown shown around the demo lives in the "model_cards" folder
    # next to this script.
    cards_dir = pathlib.Path(__file__).parent / "model_cards"
    article = (cards_dir / "article.md").read_text()
    description = (cards_dir / "description.md").read_text()

    # Each example row follows the input order of `run_inference`:
    # SMILES, SMILES file, omics file, confidence flag.
    # A file-based example (cards_dir / "molecules.smi") is currently disabled.
    examples = [
        ["COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O", None, None, False],
        [
            "COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4",
            None,
            None,
            True,
        ],
    ]

    # Input widgets, in the positional order expected by `run_inference`.
    smiles_box = gr.Textbox(
        label="SMILES",
        placeholder="COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O",
        lines=1,
    )
    smiles_upload = gr.File(
        file_types=[".smi", ".tsv"],
        label="Multiple SMILES",
    )
    omics_upload = gr.File(
        file_types=[".csv"],
        label="Transcriptomics data file",
    )
    confidence_choice = gr.Radio(
        choices=[True, False], label="Estimate confidence", value=False
    )

    # Output widgets: the full CSV for download plus a short preview table.
    result_file = gr.File(label="Download full results")
    result_preview = gr.DataFrame(label="Preview of results for 25 cell lines")

    demo = gr.Interface(
        fn=run_inference,
        title="PaccMann",
        inputs=[smiles_box, smiles_upload, omics_upload, confidence_choice],
        outputs=[result_file, result_preview],
        article=article,
        description=description,
        examples=examples,
    )
    demo.launch(debug=True, show_error=True)