"""Configuration utils."""
import json
import os

import pandas as pd
from pytoda.smiles.transforms import (
    Canonicalization,
    LeftPadding,
    SMILESToTokenIndexes,
)
from pytoda.transforms import Compose

from cos import COS_BUCKET_URI, ensure_filepath_from_uri
from utils import load
def _cos_path(filename):
    """Resolve a bucket-relative filename to a local file path."""
    return ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, filename))


# model files
MODEL_WEIGHTS_URI = _cos_path("model.pt")
MODEL_PARAMS_URI = _cos_path("model.json")
# SMILES language file
SMILES_LANGUAGE_URI = _cos_path("smiles_language.pkl")
# gene expression file
GENE_EXPRESSION_URI = _cos_path("gene_expression.csv.zip")
# genes file
GENES_URI = _cos_path("genes.pkl")
# genes standardization parameters
GENE_EXPRESSION_STANDARDIZATION_URI = _cos_path("gene_expression_standardization.pkl")
# load the model hyper-parameters
with open(MODEL_PARAMS_URI) as params_fp:
    MODEL_PARAMS = json.load(params_fp)
# fixed SMILES input length expected by the model
MAX_LENGTH = MODEL_PARAMS["smiles_padding_length"]
# load SMILES language
SMILES_LANGUAGE = load(SMILES_LANGUAGE_URI)
# load the default gene expression table
GENE_EXPRESSION = pd.read_csv(
    GENE_EXPRESSION_URI, compression="zip", low_memory=False
)
# load the gene list used by the model
GENES = load(GENES_URI)
# load gene standardization parameters
GENE_STANDARDIZATION_PARAMETERS = load(GENE_EXPRESSION_STANDARDIZATION_URI)
# SMILES preprocessing pipeline: canonicalize, map tokens to indexes,
# then left-pad to the model's fixed input length.
SMILES_TRANSFORMS = [
    Canonicalization(),
    SMILESToTokenIndexes(smiles_language=SMILES_LANGUAGE),
    LeftPadding(
        padding_length=MAX_LENGTH,
        padding_index=SMILES_LANGUAGE.padding_index,
    ),
]
SMILES_TOKENIZE_FN = Compose(SMILES_TRANSFORMS)
# prepare default gene expression data
# Reindex the columns to the model's gene list: genes missing from the
# table are added (filled with 0.0 below), non-gene columns are dropped,
# and the column order matches GENES. This replaces the previous
# double-transpose workaround (`.T.reindex(GENES)...T`), which round-tripped
# the frame through object dtype; a direct column reindex keeps the
# numeric dtypes intact.
GENE_EXPRESSION_DATA = GENE_EXPRESSION.reindex(columns=GENES).fillna(0.0).values
# NOTE: sub-selecting existing columns to remove all the genes, keeping
# only the non-gene (metadata) columns.
to_drop = list(set(GENES) & set(GENE_EXPRESSION.columns))
GENE_EXPRESSION_METADATA = GENE_EXPRESSION.drop(columns=to_drop)
# free the full table; only the prepared data and metadata are kept
del GENE_EXPRESSION
# housekeeping: how long computed results are kept before they expire.
# Defaults to one week; override via PACCMANN_RESULTS_EXPIRATION_SECONDS.
_ONE_WEEK_SECONDS = 60 * 60 * 24 * 7
RESULTS_EXPIRATION_SECONDS = float(
    os.environ.get("PACCMANN_RESULTS_EXPIRATION_SECONDS", _ONE_WEEK_SECONDS)
)
# SMILES parameters forwarded verbatim from the model configuration.
# TODO: think whether we should enforce canonicalization
_CANON_KEYS = (
    "canonical",
    "kekulize",
    "all_bonds_explicit",
    "all_hs_explicit",
    "randomize",
    "remove_bonddir",
)
CANON = {key: MODEL_PARAMS[key] for key in _CANON_KEYS}
# the maximum length is the model's padding length
CANON["smiles_maximum_length"] = MODEL_PARAMS["smiles_padding_length"]