Spaces:
Running
Running
File size: 2,856 Bytes
ec53722 30c86cf ec53722 30c86cf ec53722 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
"""Configuration utils."""
import os
import json
import dill
import pandas as pd
from pytoda.transforms import Compose
from pytoda.smiles.transforms import SMILESToTokenIndexes, LeftPadding, Canonicalization
from cos import ensure_filepath_from_uri, COS_BUCKET_URI
def _bucket_file(object_name):
    """Resolve a COS object name under the bucket to a local file path."""
    return ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, object_name))


# model files
MODEL_WEIGHTS_URI = _bucket_file("model.pt")
MODEL_PARAMS_URI = _bucket_file("model.json")
# SMILES language file
SMILES_LANGUAGE_URI = _bucket_file("smiles_language.pkl")
# gene expression file
GENE_EXPRESSION_URI = _bucket_file("gene_expression.csv.zip")
# genes file
GENES_URI = _bucket_file("genes.pkl")
# genes standardization parameters
GENE_EXPRESSION_STANDARDIZATION_URI = _bucket_file("gene_expression_standardization.pkl")
# load the model hyper-parameters
# NOTE(fix): JSON is text -- pin the encoding instead of relying on the
# platform default (which differs e.g. on Windows).
with open(MODEL_PARAMS_URI, encoding="utf-8") as fp:
    MODEL_PARAMS = json.load(fp)
# SMILES sequence length the model expects after padding
MAX_LENGTH = MODEL_PARAMS["smiles_padding_length"]
# load SMILES language
# NOTE(security): dill is pickle-based -- deserialization executes arbitrary
# code. Only ever load these artifacts from the trusted COS bucket.
with open(SMILES_LANGUAGE_URI, "rb") as fp:
    SMILES_LANGUAGE = dill.load(fp)
# load gene expression
GENE_EXPRESSION = pd.read_csv(GENE_EXPRESSION_URI, compression="zip", low_memory=False)
# load genes
with open(GENES_URI, "rb") as fp:
    GENES = dill.load(fp)
# load gene standardization parameters
with open(GENE_EXPRESSION_STANDARDIZATION_URI, "rb") as fp:
    GENE_STANDARDIZATION_PARAMETERS = dill.load(fp)
# SMILES preprocessing pipeline: canonicalize, map tokens to vocabulary
# indexes, then left-pad to the model's fixed input length.
_canonicalize = Canonicalization()
_to_indexes = SMILESToTokenIndexes(smiles_language=SMILES_LANGUAGE)
_left_pad = LeftPadding(
    padding_length=MAX_LENGTH, padding_index=SMILES_LANGUAGE.padding_index
)
SMILES_TRANSFORMS = [_canonicalize, _to_indexes, _left_pad]
SMILES_TOKENIZE_FN = Compose(SMILES_TRANSFORMS)
# prepare default gene expression data
# NOTE: transpose + reindex work around to ensure we have a column for every
# needed gene; genes absent from the CSV are filled with 0.0
GENE_EXPRESSION_DATA = GENE_EXPRESSION.T.reindex(GENES).fillna(0.0).T.values
# NOTE: sub-selecting existing columns to remove all the genes; intersecting
# with the actual columns keeps drop() from raising on missing genes
to_drop = list(set(GENES) & set(GENE_EXPRESSION.columns))
GENE_EXPRESSION_METADATA = GENE_EXPRESSION.drop(to_drop, axis=1)
# housekeeping: also drop the temporary column list, not just the raw frame
del GENE_EXPRESSION, to_drop
# housekeeping: how long computed results are retained, in seconds.
# Defaults to one week; override through the environment variable.
_DEFAULT_EXPIRATION_SECONDS = 60 * 60 * 24 * 7  # every week
RESULTS_EXPIRATION_SECONDS = float(
    os.environ.get("PACCMANN_RESULTS_EXPIRATION_SECONDS", _DEFAULT_EXPIRATION_SECONDS)
)
# SMILES parameters
# TODO: think whether we should enforce canonicalization
# Transform settings replicated from the model's training configuration; note
# that "smiles_maximum_length" maps to the model's "smiles_padding_length".
CANON = dict(
    canonical=MODEL_PARAMS["canonical"],
    kekulize=MODEL_PARAMS["kekulize"],
    all_bonds_explicit=MODEL_PARAMS["all_bonds_explicit"],
    all_hs_explicit=MODEL_PARAMS["all_hs_explicit"],
    randomize=MODEL_PARAMS["randomize"],
    remove_bonddir=MODEL_PARAMS["remove_bonddir"],
    smiles_maximum_length=MODEL_PARAMS["smiles_padding_length"],
)
|