Add smiles normalization (#11)
Browse files
- Added SMILES normalization (d823faeb77bf4d6ea7f0795d5cebd5ff2fb32344)
Co-authored-by: Victor Yukio Shirasuna <[email protected]>
smi-ted/inference/smi_ted_large/load.py
CHANGED
@@ -19,6 +19,12 @@ from transformers import BertTokenizer
|
|
19 |
import numpy as np
|
20 |
import pandas as pd
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# Standard library
|
23 |
from functools import partial
|
24 |
import regex as re
|
@@ -29,6 +35,17 @@ from tqdm import tqdm
|
|
29 |
tqdm.pandas()
|
30 |
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
class MolTranBertTokenizer(BertTokenizer):
|
33 |
def __init__(self, vocab_file: str = '',
|
34 |
do_lower_case=False,
|
@@ -476,9 +493,13 @@ class Smi_ted(nn.Module):
|
|
476 |
if self.is_cuda_available:
|
477 |
self.encoder.cuda()
|
478 |
self.decoder.cuda()
|
|
|
|
|
|
|
|
|
479 |
|
480 |
# tokenizer
|
481 |
-
idx, mask = self.tokenize(smiles)
|
482 |
|
483 |
###########
|
484 |
# Encoder #
|
@@ -547,6 +568,7 @@ class Smi_ted(nn.Module):
|
|
547 |
|
548 |
# handle single str or a list of str
|
549 |
smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
|
|
|
550 |
n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
|
551 |
|
552 |
# process in batches
|
|
|
19 |
import numpy as np
|
20 |
import pandas as pd
|
21 |
|
22 |
+
# Chemistry
|
23 |
+
from rdkit import Chem
|
24 |
+
from rdkit.Chem import PandasTools
|
25 |
+
from rdkit.Chem import Descriptors
|
26 |
+
PandasTools.RenderImagesInAllDataFrames(True)
|
27 |
+
|
28 |
# Standard library
|
29 |
from functools import partial
|
30 |
import regex as re
|
|
|
35 |
tqdm.pandas()
|
36 |
|
37 |
|
38 |
+
# function to canonicalize SMILES
|
39 |
+
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string for ``smi``, or ``None`` on failure.

    The input is parsed with ``Chem.MolFromSmiles`` and written back out
    with ``Chem.MolToSmiles``.

    Args:
        smi: SMILES string to normalize.
        canonical: Forwarded to ``Chem.MolToSmiles`` as ``canonical``.
        isomeric: Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.

    Returns:
        The re-written SMILES string, or ``None`` when ``smi`` cannot be
        parsed or converted.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # RDKit reports an unparsable string by returning None instead of
        # raising, so handle that case explicitly rather than relying on
        # MolToSmiles failing on a None argument.
        if mol is None:
            return None
        return Chem.MolToSmiles(
            mol, canonical=canonical, isomericSmiles=isomeric
        )
    except Exception:
        # Deliberate best-effort: any conversion error (e.g. a non-string
        # input) maps to None. `Exception` rather than a bare `except` so
        # KeyboardInterrupt and SystemExit still propagate.
        return None
|
47 |
+
|
48 |
+
|
49 |
class MolTranBertTokenizer(BertTokenizer):
|
50 |
def __init__(self, vocab_file: str = '',
|
51 |
do_lower_case=False,
|
|
|
493 |
if self.is_cuda_available:
|
494 |
self.encoder.cuda()
|
495 |
self.decoder.cuda()
|
496 |
+
|
497 |
+
# handle single str or a list of str
|
498 |
+
smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
|
499 |
+
smiles = smiles.apply(normalize_smiles)
|
500 |
|
501 |
# tokenizer
|
502 |
+
idx, mask = self.tokenize(smiles.to_list())
|
503 |
|
504 |
###########
|
505 |
# Encoder #
|
|
|
568 |
|
569 |
# handle single str or a list of str
|
570 |
smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
|
571 |
+
smiles = smiles.apply(normalize_smiles)
|
572 |
n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
|
573 |
|
574 |
# process in batches
|
smi-ted/inference/smi_ted_light/load.py
CHANGED
@@ -19,6 +19,12 @@ from transformers import BertTokenizer
|
|
19 |
import numpy as np
|
20 |
import pandas as pd
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# Standard library
|
23 |
from functools import partial
|
24 |
import regex as re
|
@@ -29,6 +35,17 @@ from tqdm import tqdm
|
|
29 |
tqdm.pandas()
|
30 |
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
class MolTranBertTokenizer(BertTokenizer):
|
33 |
def __init__(self, vocab_file: str = '',
|
34 |
do_lower_case=False,
|
@@ -476,9 +493,13 @@ class Smi_ted(nn.Module):
|
|
476 |
if self.is_cuda_available:
|
477 |
self.encoder.cuda()
|
478 |
self.decoder.cuda()
|
|
|
|
|
|
|
|
|
479 |
|
480 |
# tokenizer
|
481 |
-
idx, mask = self.tokenize(smiles)
|
482 |
|
483 |
###########
|
484 |
# Encoder #
|
@@ -547,6 +568,7 @@ class Smi_ted(nn.Module):
|
|
547 |
|
548 |
# handle single str or a list of str
|
549 |
smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
|
|
|
550 |
n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
|
551 |
|
552 |
# process in batches
|
|
|
19 |
import numpy as np
|
20 |
import pandas as pd
|
21 |
|
22 |
+
# Chemistry
|
23 |
+
from rdkit import Chem
|
24 |
+
from rdkit.Chem import PandasTools
|
25 |
+
from rdkit.Chem import Descriptors
|
26 |
+
PandasTools.RenderImagesInAllDataFrames(True)
|
27 |
+
|
28 |
# Standard library
|
29 |
from functools import partial
|
30 |
import regex as re
|
|
|
35 |
tqdm.pandas()
|
36 |
|
37 |
|
38 |
+
# function to canonicalize SMILES
|
39 |
+
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string for ``smi``, or ``None`` on failure.

    The input is parsed with ``Chem.MolFromSmiles`` and written back out
    with ``Chem.MolToSmiles``.

    Args:
        smi: SMILES string to normalize.
        canonical: Forwarded to ``Chem.MolToSmiles`` as ``canonical``.
        isomeric: Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.

    Returns:
        The re-written SMILES string, or ``None`` when ``smi`` cannot be
        parsed or converted.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # RDKit reports an unparsable string by returning None instead of
        # raising, so handle that case explicitly rather than relying on
        # MolToSmiles failing on a None argument.
        if mol is None:
            return None
        return Chem.MolToSmiles(
            mol, canonical=canonical, isomericSmiles=isomeric
        )
    except Exception:
        # Deliberate best-effort: any conversion error (e.g. a non-string
        # input) maps to None. `Exception` rather than a bare `except` so
        # KeyboardInterrupt and SystemExit still propagate.
        return None
|
47 |
+
|
48 |
+
|
49 |
class MolTranBertTokenizer(BertTokenizer):
|
50 |
def __init__(self, vocab_file: str = '',
|
51 |
do_lower_case=False,
|
|
|
493 |
if self.is_cuda_available:
|
494 |
self.encoder.cuda()
|
495 |
self.decoder.cuda()
|
496 |
+
|
497 |
+
# handle single str or a list of str
|
498 |
+
smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
|
499 |
+
smiles = smiles.apply(normalize_smiles)
|
500 |
|
501 |
# tokenizer
|
502 |
+
idx, mask = self.tokenize(smiles.to_list())
|
503 |
|
504 |
###########
|
505 |
# Encoder #
|
|
|
568 |
|
569 |
# handle single str or a list of str
|
570 |
smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
|
571 |
+
smiles = smiles.apply(normalize_smiles)
|
572 |
n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
|
573 |
|
574 |
# process in batches
|