File size: 2,856 Bytes
ec53722
 
 
 
 
 
30c86cf
ec53722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30c86cf
ec53722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""Configuration utils."""
import os
import json
import dill
import pandas as pd
from pytoda.transforms import Compose
from pytoda.smiles.transforms import SMILESToTokenIndexes, LeftPadding, Canonicalization
from cos import ensure_filepath_from_uri, COS_BUCKET_URI

# model files
# NOTE: every statement in this module runs at import time and performs file
# I/O — ``ensure_filepath_from_uri`` resolves a COS URI to a local file path,
# so importing this module locates/fetches all model assets up front.
MODEL_WEIGHTS_URI = ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, "model.pt"))
MODEL_PARAMS_URI = ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, "model.json"))
# SMILES language file
SMILES_LANGUAGE_URI = ensure_filepath_from_uri(
    os.path.join(COS_BUCKET_URI, "smiles_language.pkl")
)
# gene expression file
GENE_EXPRESSION_URI = ensure_filepath_from_uri(
    os.path.join(COS_BUCKET_URI, "gene_expression.csv.zip")
)
# genes file
GENES_URI = ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, "genes.pkl"))
# genes standardization parameters
GENE_EXPRESSION_STANDARDIZATION_URI = ensure_filepath_from_uri(
    os.path.join(COS_BUCKET_URI, "gene_expression_standardization.pkl")
)
# load the model parameters (JSON dict; keys read below: "smiles_padding_length",
# "canonical", "kekulize", "all_bonds_explicit", "all_hs_explicit",
# "randomize", "remove_bonddir")
with open(MODEL_PARAMS_URI) as fp:
    MODEL_PARAMS = json.load(fp)
# fixed SMILES input length the model expects (used as the padding target)
MAX_LENGTH = MODEL_PARAMS["smiles_padding_length"]
# load SMILES language
# NOTE(review): ``dill.load`` on a bucket-hosted artifact can execute
# arbitrary code during unpickling — the COS bucket must be trusted.
with open(SMILES_LANGUAGE_URI, "rb") as fp:
    SMILES_LANGUAGE = dill.load(fp)
# load gene expression
GENE_EXPRESSION = pd.read_csv(GENE_EXPRESSION_URI, compression="zip", low_memory=False)
# load genes (presumably an ordered collection of gene identifiers aligned
# with the model's expected input order — TODO confirm against training code)
with open(GENES_URI, "rb") as fp:
    GENES = dill.load(fp)
# load gene standardization parameters
with open(GENE_EXPRESSION_STANDARDIZATION_URI, "rb") as fp:
    GENE_STANDARDIZATION_PARAMETERS = dill.load(fp)
# smiles transformations: canonicalize -> token indexes -> left-pad to the
# model's fixed input length using the language's padding index
SMILES_TRANSFORMS = [
    Canonicalization(),
    SMILESToTokenIndexes(smiles_language=SMILES_LANGUAGE),
    LeftPadding(padding_length=MAX_LENGTH, padding_index=SMILES_LANGUAGE.padding_index),
]
SMILES_TOKENIZE_FN = Compose(SMILES_TRANSFORMS)
# prepare default gene expression data
# NOTE: transpose and reset work around to ensure we have all needed genes:
# after ``.T`` the genes are the index, ``reindex(GENES)`` selects/orders them
# to exactly ``GENES`` (absent genes become NaN), ``fillna(0.0)`` zeroes both
# those and any pre-existing NaNs, and the final ``.T.values`` yields an
# ndarray with columns in ``GENES`` order.
GENE_EXPRESSION_DATA = GENE_EXPRESSION.T.reindex(GENES).fillna(0.0).T.values
# NOTE: sub-selecting existing columns to remove all the genes, leaving only
# the non-gene (metadata) columns in the frame
to_drop = list(set(GENES) & set(GENE_EXPRESSION.columns))
GENE_EXPRESSION_METADATA = GENE_EXPRESSION.drop(to_drop, axis=1)
# free the full frame — only the derived array and metadata are kept around
del GENE_EXPRESSION
# housekeeping
# how long computed results are retained, in seconds; overridable via the
# PACCMANN_RESULTS_EXPIRATION_SECONDS environment variable
RESULTS_EXPIRATION_SECONDS = float(
    os.environ.get(
        "PACCMANN_RESULTS_EXPIRATION_SECONDS",
        # every week
        60 * 60 * 24 * 7,
    )
)
# SMILES parameters
# TODO: think whether we should enforce canonicalization
# keyword arguments mirroring the model's SMILES preprocessing configuration
CANON = {
    "canonical": MODEL_PARAMS["canonical"],
    "kekulize": MODEL_PARAMS["kekulize"],
    "all_bonds_explicit": MODEL_PARAMS["all_bonds_explicit"],
    "all_hs_explicit": MODEL_PARAMS["all_hs_explicit"],
    "randomize": MODEL_PARAMS["randomize"],
    "remove_bonddir": MODEL_PARAMS["remove_bonddir"],
    "smiles_maximum_length": MODEL_PARAMS["smiles_padding_length"],
}