paccmann / configuration.py
jannisborn's picture
wip
08f739a unverified
"""Configuration utils."""
import os
import json
import pandas as pd
from pytoda.transforms import Compose
from pytoda.smiles.transforms import SMILESToTokenIndexes, LeftPadding, Canonicalization
from cos import ensure_filepath_from_uri, COS_BUCKET_URI
from utils import load
def _bucket_file(filename: str):
    """Return ``ensure_filepath_from_uri`` applied to ``filename`` inside the bucket.

    The bucket location is ``COS_BUCKET_URI``; the file name is appended with
    ``os.path.join``. The returned value is what the rest of this module hands
    to ``open`` / ``pd.read_csv`` / ``load``.
    NOTE(review): presumably this makes the object available locally and
    returns its path -- confirm against the ``cos`` module.
    """
    return ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, filename))


# model files: weights and hyper-parameters
MODEL_WEIGHTS_URI = _bucket_file("model.pt")
MODEL_PARAMS_URI = _bucket_file("model.json")
# SMILES language file
SMILES_LANGUAGE_URI = _bucket_file("smiles_language.pkl")
# gene expression file (zipped CSV)
GENE_EXPRESSION_URI = _bucket_file("gene_expression.csv.zip")
# genes file
GENES_URI = _bucket_file("genes.pkl")
# gene expression standardization parameters
GENE_EXPRESSION_STANDARDIZATION_URI = _bucket_file(
    "gene_expression_standardization.pkl"
)
# load the model parameters (only the JSON is read here, not the weights)
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open(MODEL_PARAMS_URI, encoding="utf-8") as fp:
    MODEL_PARAMS = json.load(fp)
# padding length for tokenized SMILES; passed to LeftPadding further down
MAX_LENGTH = MODEL_PARAMS["smiles_padding_length"]
# SMILES language object, read with the project's `load` helper
# NOTE(review): the file is a .pkl, so `load` presumably unpickles -- confirm in `utils`
SMILES_LANGUAGE = load(SMILES_LANGUAGE_URI)
# gene expression table, read from a zip-compressed CSV
GENE_EXPRESSION = pd.read_csv(
    GENE_EXPRESSION_URI,
    compression="zip",
    low_memory=False,
)
# gene labels; used below to reindex the expression table and to pick the
# columns that are dropped from the metadata
GENES = load(GENES_URI)
# gene standardization parameters
GENE_STANDARDIZATION_PARAMETERS = load(GENE_EXPRESSION_STANDARDIZATION_URI)
# SMILES transformations: canonicalize, map tokens to indexes, then left-pad
# to MAX_LENGTH using the language's padding index.
# NOTE(review): Canonicalization() is built with its defaults, not with the
# flags collected in CANON -- confirm this is intended.
SMILES_TRANSFORMS = [
    Canonicalization(),
    SMILESToTokenIndexes(smiles_language=SMILES_LANGUAGE),
    LeftPadding(
        padding_length=MAX_LENGTH,
        padding_index=SMILES_LANGUAGE.padding_index,
    ),
]
# single callable wrapping the list above
SMILES_TOKENIZE_FN = Compose(SMILES_TRANSFORMS)
# prepare default gene expression data
# NOTE: the table is transposed so that GENES can be used as a row index;
# reindexing yields one row per entry of GENES (labels missing from the table
# come back as NaN), every NaN is replaced by 0.0, and the result is transposed
# back before taking the underlying array.
GENE_EXPRESSION_DATA = (
    GENE_EXPRESSION.transpose()
    .reindex(GENES)
    .fillna(0.0)
    .transpose()
    .values
)
# NOTE: only gene labels that exist as columns are dropped, so the metadata
# frame keeps every column that is not listed in GENES
to_drop = list(set(GENES) & set(GENE_EXPRESSION.columns))
GENE_EXPRESSION_METADATA = GENE_EXPRESSION.drop(columns=to_drop)
# the full table is not kept as a module attribute
del GENE_EXPRESSION
# housekeeping
# Default expiration when the environment variable is unset: one week, in seconds.
_ONE_WEEK_IN_SECONDS = 7 * 24 * 60 * 60
# The environment value (a string) or the default is converted to float.
RESULTS_EXPIRATION_SECONDS = float(
    os.getenv("PACCMANN_RESULTS_EXPIRATION_SECONDS", _ONE_WEEK_IN_SECONDS)
)
# SMILES parameters
# TODO: think whether we should enforce canonicalization
CANON = {
    # entries copied from the model parameters under the same key
    **{
        flag: MODEL_PARAMS[flag]
        for flag in (
            "canonical",
            "kekulize",
            "all_bonds_explicit",
            "all_hs_explicit",
            "randomize",
            "remove_bonddir",
        )
    },
    # stored under a different key than in the model parameters
    "smiles_maximum_length": MODEL_PARAMS["smiles_padding_length"],
}