# paccmann / app.py
# Last commit: "update" (66f9f84, unverified) by jannisborn
# File size: 5.05 kB
import logging
import os
import pathlib
import tempfile
from typing import List, Optional
import gradio as gr
import pandas as pd
from rdkit import Chem
from tqdm import tqdm
from configuration import GENE_EXPRESSION_METADATA
from submission import submission
# Module-level logger. Only a NullHandler is attached here, so log output is
# left to whatever logging configuration the hosting application installs.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Abbreviations for long tissue-site names. ``run_inference`` applies this to
# the ``site`` column of the result table; sites without an entry are kept
# unchanged.
site_mapper = dict(
    central_nervous_system="CNS",
    haematopoietic_and_lymphoid_tissue="Haema_lymph",
    upper_aerodigestive_tract="digestive",
    autonomic_ganglia="ganglia",
)
def _uploaded_path(upload):
    """Return the filesystem path of an uploaded file, or ``None``.

    Accepts a plain path (``str`` / ``os.PathLike``) as well as a file wrapper
    that exposes the path through its ``name`` attribute.
    """
    if upload is None:
        return None
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


def run_inference(
    smiles: Optional[str],
    smiles_path: Optional[str],
    omic: Optional[str],
    confidence: bool,
):
    """Predict IC50 values for one or several molecules.

    Args:
        smiles: A single SMILES string. ``None``, an empty string or a
            whitespace-only string all mean "not provided".
        smiles_path: Uploaded tab-separated file without header whose first
            column holds one SMILES per row. Either a path or a file wrapper
            with a ``name`` attribute.
        omic: Optional uploaded transcriptomics file (path or file wrapper).
            When omitted, ``submission`` is called with ``omics_file=None``.
        confidence: Whether to request confidence estimates and add them to
            the result table.

    Returns:
        A tuple ``(csv_path, preview)`` where ``csv_path`` is the location of
        the CSV file holding the full result table and ``preview`` contains
        its first 25 rows.

    Raises:
        TypeError: If neither or both of ``smiles`` and ``smiles_path`` are
            given.
        ValueError: If RDKit cannot parse one of the SMILES.
    """
    # Read SMILES. A missing textbox value may arrive as None rather than "".
    single_smiles = (smiles or "").strip()
    if not single_smiles and smiles_path is None:
        raise TypeError("Pass either single SMILES or a file")
    if single_smiles and smiles_path is not None:
        raise TypeError("Pass either single SMILES or a file, not both")

    if single_smiles:
        smiles_list: List[str] = [single_smiles]
    else:
        smiles_data = pd.read_csv(
            _uploaded_path(smiles_path), sep="\t", header=None
        )
        smiles_list = smiles_data[0].tolist()

    # Validate everything up front so no inference is run on a bad batch.
    for smi in smiles_list:
        if Chem.MolFromSmiles(smi) is None:
            raise ValueError(f"Found invalid SMILES {smi}")

    # Read omics and otherwise load baseline
    omic_path = _uploaded_path(omic)

    predicted_df = pd.DataFrame({})
    for smi in tqdm(smiles_list, total=len(smiles_list)):
        output = submission(
            drug={"smiles": smi},
            workspace_id="emulated_workspace_id",
            task_id="emulated_task_id",
            estimate_confidence=confidence,
            omics_file=omic_path,
        )
        # For the moment no attention analysis
        output.pop("gene_attention")
        output.pop("smiles_attention", None)
        output.pop("IC50")

        predicted_df[f"IC50_{smi}"] = (
            output["log_micromolar_IC50"].squeeze().round(3)
        )
        if confidence:
            predicted_df[f"aleatoric_confidence_{smi}"] = (
                output["aleatoric_confidence"].squeeze().round(3)
            )
            # Fixed: this column previously duplicated the aleatoric values.
            predicted_df[f"epistemic_confidence_{smi}"] = (
                output["epistemic_confidence"].squeeze().round(3)
            )

    # Prepare DF to visualize
    if omic_path is None:
        # Baseline run: attach the cell-line metadata to the predictions.
        df = GENE_EXPRESSION_METADATA.drop(
            columns=[
                "histology",
                "cell_line_name",
                "IC50 (min/max scaled)",
                "IC50 (log(μmol))",
            ]
        )
        df["site"] = df["site"].apply(lambda x: site_mapper.get(x, x))
        # Keep only the part of the identifier before the first underscore.
        df["cell_line"] = df["cell_line"].apply(lambda x: x.split("_")[0])
        if (not confidence) and "aleatoric_confidence" in df.columns:
            df = df.drop(columns=["aleatoric_confidence", "epistemic_confidence"])
        # predicted_df only carries per-molecule suffixed confidence columns
        # (and only when confidence was requested), so it needs no pruning.
        result_df = pd.concat(
            [df["cell_line"], predicted_df, df.drop(["cell_line"], axis=1)],
            axis=1,
        )
    else:
        # Custom omics: no metadata is joined, only the raw predictions.
        result_df = predicted_df

    # Save to temporary dir
    # NOTE(review): the file name is fixed, so every call overwrites the
    # result of the previous one.
    temp_path = os.path.join(tempfile.gettempdir(), "paccmann_result.csv")
    result_df.to_csv(temp_path)
    return temp_path, result_df.head(25)
if __name__ == "__main__":
    # Static assets (article and description texts) live next to this script.
    metadata_root = pathlib.Path(__file__).parent / "model_cards"
    article = (metadata_root / "article.md").read_text()
    description = (metadata_root / "description.md").read_text()

    # Example rows follow the input order: SMILES, SMILES file, omics file,
    # confidence flag.
    examples = [
        ["COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O", None, None, False],
        ["COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4", None, None, True],
        # [None, metadata_root.joinpath("molecules.smi"), None, False],
    ]

    # Input widgets, in the positional order expected by run_inference.
    smiles_box = gr.Textbox(
        label="SMILES",
        placeholder="COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O",
        lines=1,
    )
    smiles_upload = gr.File(
        file_types=[".smi", ".tsv"],
        label="Multiple SMILES",
    )
    omics_upload = gr.File(
        file_types=[".csv"],
        label="Transcriptomics data file",
    )
    confidence_toggle = gr.Radio(
        choices=[True, False], label="Estimate confidence", value=False
    )

    # Output widgets: the CSV download followed by the preview table.
    download = gr.File(label="Download full results")
    preview = gr.DataFrame(label="Preview of results for 25 cell lines")

    demo = gr.Interface(
        fn=run_inference,
        title="PaccMann",
        inputs=[smiles_box, smiles_upload, omics_upload, confidence_toggle],
        outputs=[download, preview],
        article=article,
        description=description,
        examples=examples,
    )
    demo.launch(debug=True, show_error=True)