paccmann / configuration.py
jannisborn's picture
refactor loading
bb77ad9 unverified
raw
history blame
2.76 kB
"""Configuration utils."""
import os
import json
import pandas as pd
from pytoda.transforms import Compose
from pytoda.smiles.transforms import SMILESToTokenIndexes, LeftPadding, Canonicalization
from cos import ensure_filepath_from_uri, COS_BUCKET_URI
from .utils import load
def _bucket_file(filename):
    """Resolve ``filename`` inside the COS bucket.

    Joins ``filename`` onto ``COS_BUCKET_URI`` and hands the result to
    ``ensure_filepath_from_uri``; whatever that helper returns is passed
    back unchanged.
    NOTE(review): presumably the helper fetches/caches the object and returns
    a local path (the values are later given to ``open`` and
    ``pd.read_csv``) -- confirm against the ``cos`` module.
    """
    return ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, filename))


# model files: weights and hyper-parameters
MODEL_WEIGHTS_URI = _bucket_file("model.pt")
MODEL_PARAMS_URI = _bucket_file("model.json")
# SMILES language file
SMILES_LANGUAGE_URI = _bucket_file("smiles_language.pkl")
# gene expression file (zipped CSV)
GENE_EXPRESSION_URI = _bucket_file("gene_expression.csv.zip")
# genes file
GENES_URI = _bucket_file("genes.pkl")
# gene expression standardization parameters
GENE_EXPRESSION_STANDARDIZATION_URI = _bucket_file(
    "gene_expression_standardization.pkl"
)
# Load the model hyper-parameters from the JSON file.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# locale default that ``open`` would otherwise fall back to.
with open(MODEL_PARAMS_URI, encoding="utf-8") as fp:
    MODEL_PARAMS = json.load(fp)
# Length every tokenized SMILES is padded to (read from the model parameters).
MAX_LENGTH = MODEL_PARAMS["smiles_padding_length"]
# SMILES language object, read through the project-local ``load`` helper.
# NOTE(review): the ``.pkl`` file names suggest ``load`` unpickles its input --
# if so, the bucket content must be trusted; confirm in ``.utils``.
SMILES_LANGUAGE = load(SMILES_LANGUAGE_URI)
# Gene expression table from the zipped CSV. ``low_memory=False`` makes pandas
# parse the file in one pass instead of chunk-wise.
GENE_EXPRESSION = pd.read_csv(
    GENE_EXPRESSION_URI,
    compression="zip",
    low_memory=False,
)
# Genes used to index the expression table further below.
GENES = load(GENES_URI)
# Parameters for standardizing gene expression values.
GENE_STANDARDIZATION_PARAMETERS = load(GENE_EXPRESSION_STANDARDIZATION_URI)
# SMILES preprocessing steps, applied in list order:
# canonicalize -> map tokens to indexes -> left-pad to MAX_LENGTH.
SMILES_TRANSFORMS = [
    Canonicalization(),
    SMILESToTokenIndexes(smiles_language=SMILES_LANGUAGE),
    LeftPadding(
        padding_length=MAX_LENGTH,
        padding_index=SMILES_LANGUAGE.padding_index,
    ),
]
# Single callable chaining the steps above.
SMILES_TOKENIZE_FN = Compose(SMILES_TRANSFORMS)
# Default gene expression matrix.
# NOTE: the table is transposed so genes become the row index, reindexed by
# GENES so every requested gene is present (in GENES order), and transposed
# back. Genes absent from the file show up as NaN and are filled with 0.0.
GENE_EXPRESSION_DATA = (
    GENE_EXPRESSION.T
    .reindex(GENES)
    .fillna(0.0)
    .T
    .values
)
# NOTE: only gene columns that actually exist in the table can be dropped;
# what remains after removing them is kept as metadata.
to_drop = list(set(GENES) & set(GENE_EXPRESSION.columns))
GENE_EXPRESSION_METADATA = GENE_EXPRESSION.drop(columns=to_drop)
# Both derived objects exist now; release the module-level reference to the
# full table.
del GENE_EXPRESSION
# Housekeeping: expiration time for results, in seconds.
# Taken from the PACCMANN_RESULTS_EXPIRATION_SECONDS environment variable
# when it is set; otherwise one week (7 days * 24 h * 60 min * 60 s).
RESULTS_EXPIRATION_SECONDS = float(
    os.getenv("PACCMANN_RESULTS_EXPIRATION_SECONDS", 7 * 24 * 60 * 60)
)
# SMILES parameters, copied from the model parameters.
# TODO: think whether we should enforce canonicalization
CANON = {
    # these entries keep the same key name as in MODEL_PARAMS
    **{
        key: MODEL_PARAMS[key]
        for key in (
            "canonical",
            "kekulize",
            "all_bonds_explicit",
            "all_hs_explicit",
            "randomize",
            "remove_bonddir",
        )
    },
    # the padding length is exposed under a different key
    "smiles_maximum_length": MODEL_PARAMS["smiles_padding_length"],
}