Spaces:
Running
Running
File size: 4,631 Bytes
ec53722 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
"""Submission-related utilities."""
import os
import json
import logging
import numpy as np
import pandas as pd
from io import StringIO
from typing import Optional
from sklearn.preprocessing import StandardScaler
from configuration import (
GENE_EXPRESSION_DATA,
GENE_EXPRESSION_METADATA,
GENES,
GENE_STANDARDIZATION_PARAMETERS,
)
from cos import RESULTS_PREFIX, string_to_key
from forward import predict
# from attention import upload_attention
# Module-level logger shared by everything in this file; the logger name is a
# fixed string rather than ``__name__``.
logger = logging.getLogger("openapi_server:submission")
def submission(
    drug: dict,
    workspace_id: str,
    task_id: str,
    estimate_confidence: bool = False,
    omics_file: Optional[str] = None,
) -> dict:
    """
    Submit PaccMann prediction

    Prepares gene expression data (either the module defaults or an uploaded
    table), runs ``predict`` for the given drug and returns the predictions
    converted with ``.numpy()``.

    Args:
        drug (dict): drug to analyse in dictionary format. Only the
            ``"smiles"`` entry is read here.
        workspace_id (str): workspace identifier for the submission.
        task_id (str): task identifier.
        estimate_confidence (bool, optional): estimate confidence of the
            prediction. Defaults to False.
        omics_file (Optional[str], optional): source of the expression data,
            handed to ``pandas.read_csv`` unchanged. When None, the default
            expression data and metadata from ``configuration`` are used.
            Defaults to None.

    Returns:
        dict: one entry per key returned by ``predict``, each value converted
            with ``.numpy()``. When a single sample was uploaded only the
            first row of every value is kept.
    """
    # NOTE: only the disabled attention upload further down consumes this.
    prefix = os.path.join(RESULTS_PREFIX, workspace_id, task_id)  # noqa: F841
    logger.debug("processing omic data.")
    # NOTE: this trick is used in case a single example is passed
    single_example = False
    if omics_file is None:
        gene_expression = GENE_EXPRESSION_DATA
        # Work on a copy: prediction columns are added below and must not
        # leak into the shared module-level metadata across calls.
        gene_expression_metadata = GENE_EXPRESSION_METADATA.copy()
    else:
        logger.debug("parsing uploaded omic data.")
        logger.debug(omics_file)
        gene_expression_df = pd.read_csv(omics_file, low_memory=False)
        logger.debug(gene_expression_df.columns)
        # Columns that are known genes are expression values; everything
        # else is treated as sample metadata.
        to_drop = list(set(GENES) & set(gene_expression_df.columns))
        # Align columns to GENES (order and presence); genes missing from
        # the upload are filled with 0.0.
        gene_expression_data = gene_expression_df.T.reindex(GENES).fillna(0.0).T
        gene_expression_metadata = gene_expression_df.drop(to_drop, axis=1)
        logger.debug("peek parsed expression and metadata.")
        logger.debug("gene_expression_data:\n{}".format(gene_expression_data.head()))
        logger.debug(
            "gene_expression_metadata:\n{}".format(gene_expression_metadata.head())
        )
        if gene_expression_data.shape[0] < 2:
            logger.debug(
                "single example, standardizing with default parameters:\n{}".format(
                    GENE_STANDARDIZATION_PARAMETERS
                )
            )
            single_example = True
            # A scaler cannot be fitted on fewer than two rows, so the
            # precomputed parameters are applied instead.
            # NOTE(review): presumably index 0 is the mean and index 1 the
            # standard deviation -- confirm against ``configuration``.
            gene_expression = (
                gene_expression_data.values - GENE_STANDARDIZATION_PARAMETERS[0]
            ) / GENE_STANDARDIZATION_PARAMETERS[1]
            # Duplicate the sample so the model receives two rows; the extra
            # prediction is discarded after ``predict`` returns.
            gene_expression = np.vstack(2 * [gene_expression])
            logger.debug(gene_expression.shape)
        else:
            gene_expression = StandardScaler().fit_transform(
                gene_expression_data.values
            )
        logger.debug("gene_expression:\n{}".format(gene_expression[:10]))
    logger.debug("omic data prepared if present.")
    prediction_dict = predict(
        smiles=drug["smiles"],
        gene_expression=gene_expression,
        estimate_confidence=estimate_confidence,
    )
    # from tensors; built as a new dict so the mapping returned by
    # ``predict`` is not rewritten while it is being iterated.
    result = {
        key: value.numpy()[:1] if single_example else value.numpy()
        for key, value in prediction_dict.items()
    }
    # merge for single table, index is unique identifier for samples.
    gene_expression_metadata["IC50 (min/max scaled)"] = result["IC50"]
    gene_expression_metadata["IC50 (log(μmol))"] = result["log_micromolar_IC50"]
    if estimate_confidence:
        gene_expression_metadata["epistemic_confidence"] = result[
            "epistemic_confidence"
        ]
        gene_expression_metadata["aleatoric_confidence"] = result[
            "aleatoric_confidence"
        ]
    # NOTE: nothing is uploaded in this function; the log messages below are
    # kept verbatim from when upload steps ran at these points.
    logger.debug("uploaded predicted sensitivity table including metadata.")
    # attention
    # result.update(
    #     upload_attention(
    #         prefix,
    #         sample_names=list(map(str, gene_expression_metadata.index)),
    #         omic_attention=prediction_dict["gene_attention"],
    #         smiles_attention=prediction_dict["smiles_attention"],
    #     )
    # )
    logger.debug("uploaded attention for each sample.")
    logger.debug("uploading drug information and sensitivity.")
    # prediction (is sensitivity_json in API)
    logger.debug("uploaded drug information and sensitivity.")
    # NOTE: Ordering corresponds to IDs in GEP metadata!
    return result
|