# paccmann / configuration.py
# Last commit by jannisborn: "fix: Bugfix in SMILES canonicalization" (30c86cf, unverified)
# File size: 2.86 kB
"""Configuration utils."""
import os
import json
import dill
import pandas as pd
from pytoda.transforms import Compose
from pytoda.smiles.transforms import SMILESToTokenIndexes, LeftPadding, Canonicalization
from cos import ensure_filepath_from_uri, COS_BUCKET_URI
# Artifact locations. Every artifact is stored under COS_BUCKET_URI and is
# resolved through ensure_filepath_from_uri (imported from cos; presumably it
# makes the object available locally and returns its path -- confirm there).
def _artifact_filepath(filename):
    """Resolve `filename` under COS_BUCKET_URI via ensure_filepath_from_uri."""
    return ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, filename))


# model files
MODEL_WEIGHTS_URI = _artifact_filepath("model.pt")
MODEL_PARAMS_URI = _artifact_filepath("model.json")
# SMILES language file
SMILES_LANGUAGE_URI = _artifact_filepath("smiles_language.pkl")
# gene expression file
GENE_EXPRESSION_URI = _artifact_filepath("gene_expression.csv.zip")
# genes file
GENES_URI = _artifact_filepath("genes.pkl")
# genes standardization parameters
GENE_EXPRESSION_STANDARDIZATION_URI = _artifact_filepath(
    "gene_expression_standardization.pkl"
)
# load the model parameters (only the JSON parameters are read here, not the
# weights). JSON is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open(MODEL_PARAMS_URI, encoding="utf-8") as fp:
    MODEL_PARAMS = json.load(fp)
# padding length used for tokenized SMILES, taken from the model parameters
MAX_LENGTH = MODEL_PARAMS["smiles_padding_length"]
# NOTE(security): dill.load, like pickle, can execute arbitrary code while
# deserializing. The .pkl artifacts loaded below must come from a trusted
# source only.
# load SMILES language
with open(SMILES_LANGUAGE_URI, "rb") as fp:
    SMILES_LANGUAGE = dill.load(fp)
# load gene expression (zipped CSV; low_memory=False reads the file in one
# pass rather than in chunks)
GENE_EXPRESSION = pd.read_csv(GENE_EXPRESSION_URI, compression="zip", low_memory=False)
# load genes
with open(GENES_URI, "rb") as fp:
    GENES = dill.load(fp)
# load gene standardization parameters
with open(GENE_EXPRESSION_STANDARDIZATION_URI, "rb") as fp:
    GENE_STANDARDIZATION_PARAMETERS = dill.load(fp)
# SMILES preprocessing pipeline; the transforms are listed in the order they
# are handed to Compose.
SMILES_TRANSFORMS = [
    # 1. canonicalize the input SMILES (library defaults)
    Canonicalization(),
    # 2. convert the SMILES into token indexes of the loaded SMILES language
    SMILESToTokenIndexes(smiles_language=SMILES_LANGUAGE),
    # 3. left-pad to the model's padding length with the language's padding index
    LeftPadding(
        padding_index=SMILES_LANGUAGE.padding_index,
        padding_length=MAX_LENGTH,
    ),
]
# single callable wrapping the transforms above
SMILES_TOKENIZE_FN = Compose(SMILES_TRANSFORMS)
# Default gene expression values, with exactly one column per entry of GENES.
# NOTE: the reindex is done on the transposed frame as a workaround to ensure
# all needed genes are present; genes missing from the file are filled with 0.0.
GENE_EXPRESSION_DATA = (
    GENE_EXPRESSION.T
    .reindex(GENES)
    .fillna(0.0)
    .T
    .values
)
# NOTE: only gene columns that actually exist in the frame can be dropped;
# whatever remains after removing them is kept as metadata.
to_drop = list(set(GENES).intersection(GENE_EXPRESSION.columns))
GENE_EXPRESSION_METADATA = GENE_EXPRESSION.drop(columns=to_drop)
# the full frame is no longer needed once data and metadata are split
del GENE_EXPRESSION
# housekeeping
# Lifetime of stored results in seconds. Overridable through the
# PACCMANN_RESULTS_EXPIRATION_SECONDS environment variable; defaults to one
# week (7 days * 24 h * 60 min * 60 s).
RESULTS_EXPIRATION_SECONDS = float(
    os.getenv("PACCMANN_RESULTS_EXPIRATION_SECONDS", 7 * 24 * 60 * 60)
)
# SMILES parameters
# TODO: think whether we should enforce canonicalization
# Settings copied verbatim from the model parameters, in this key order.
CANON = {
    key: MODEL_PARAMS[key]
    for key in (
        "canonical",
        "kekulize",
        "all_bonds_explicit",
        "all_hs_explicit",
        "randomize",
        "remove_bonddir",
    )
}
# the maximum SMILES length is stored under a different name in the model parameters
CANON["smiles_maximum_length"] = MODEL_PARAMS["smiles_padding_length"]