pgantzer committed on
Commit 1760662 · 1 Parent(s): c08b1f5
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. SynTool/__init__.py +3 -0
  2. SynTool/chem/__init__.py +0 -0
  3. SynTool/chem/__pycache__/__init__.cpython-310.pyc +0 -0
  4. SynTool/chem/__pycache__/reaction.cpython-310.pyc +0 -0
  5. SynTool/chem/__pycache__/retron.cpython-310.pyc +0 -0
  6. SynTool/chem/__pycache__/utils.cpython-310.pyc +0 -0
  7. SynTool/chem/data/__init__.py +0 -0
  8. SynTool/chem/data/__pycache__/__init__.cpython-310.pyc +0 -0
  9. SynTool/chem/data/__pycache__/cleaning.cpython-310.pyc +0 -0
  10. SynTool/chem/data/__pycache__/filtering.cpython-310.pyc +0 -0
  11. SynTool/chem/data/__pycache__/mapping.cpython-310.pyc +0 -0
  12. SynTool/chem/data/__pycache__/standardizer.cpython-310.pyc +0 -0
  13. SynTool/chem/data/cleaning.py +124 -0
  14. SynTool/chem/data/filtering.py +917 -0
  15. SynTool/chem/data/mapping.py +96 -0
  16. SynTool/chem/data/mapping.py.bk +90 -0
  17. SynTool/chem/data/standardizer.py +604 -0
  18. SynTool/chem/reaction.py +107 -0
  19. SynTool/chem/reaction_rules/__init__.py +0 -0
  20. SynTool/chem/reaction_rules/__pycache__/__init__.cpython-310.pyc +0 -0
  21. SynTool/chem/reaction_rules/__pycache__/extraction.cpython-310.pyc +0 -0
  22. SynTool/chem/reaction_rules/extraction.py +679 -0
  23. SynTool/chem/reaction_rules/manual/__init__.py +6 -0
  24. SynTool/chem/reaction_rules/manual/decompositions.py +415 -0
  25. SynTool/chem/reaction_rules/manual/transformations.py +535 -0
  26. SynTool/chem/retron.py +132 -0
  27. SynTool/chem/utils.py +227 -0
  28. SynTool/interfaces/__init__.py +0 -0
  29. SynTool/interfaces/__pycache__/__init__.cpython-310.pyc +0 -0
  30. SynTool/interfaces/__pycache__/visualisation.cpython-310.pyc +0 -0
  31. SynTool/interfaces/cli.py +530 -0
  32. SynTool/interfaces/cli.py.bk +241 -0
  33. SynTool/interfaces/visualisation.py +346 -0
  34. SynTool/mcts/__init__.py +7 -0
  35. SynTool/mcts/__pycache__/__init__.cpython-310.pyc +0 -0
  36. SynTool/mcts/__pycache__/evaluation.cpython-310.pyc +0 -0
  37. SynTool/mcts/__pycache__/expansion.cpython-310.pyc +0 -0
  38. SynTool/mcts/__pycache__/node.cpython-310.pyc +0 -0
  39. SynTool/mcts/__pycache__/search.cpython-310.pyc +0 -0
  40. SynTool/mcts/__pycache__/tree.cpython-310.pyc +0 -0
  41. SynTool/mcts/evaluation.py +59 -0
  42. SynTool/mcts/expansion.py +83 -0
  43. SynTool/mcts/node.py +49 -0
  44. SynTool/mcts/search.py +135 -0
  45. SynTool/mcts/tree.py +659 -0
  46. SynTool/ml/__init__.py +0 -0
  47. SynTool/ml/__pycache__/__init__.cpython-310.pyc +0 -0
  48. SynTool/ml/networks/__init__.py +0 -0
  49. SynTool/ml/networks/__pycache__/__init__.cpython-310.pyc +0 -0
  50. SynTool/ml/networks/__pycache__/modules.cpython-310.pyc +0 -0
SynTool/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .mcts import *
2
+
3
+ __all__ = ["Tree"]
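The package root re-exports only the MCTS search tree, so downstream code imports it from the top level. A minimal sketch, assuming the SynTool package from this commit is installed:

    from SynTool import Tree  # Tree comes from SynTool.mcts and is the only name in __all__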
SynTool/chem/__init__.py ADDED
File without changes
SynTool/chem/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (149 Bytes).
SynTool/chem/__pycache__/reaction.cpython-310.pyc ADDED
Binary file (3.65 kB).
SynTool/chem/__pycache__/retron.cpython-310.pyc ADDED
Binary file (4.88 kB).
SynTool/chem/__pycache__/utils.cpython-310.pyc ADDED
Binary file (8.25 kB).
SynTool/chem/data/__init__.py ADDED
File without changes
SynTool/chem/data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (150 Bytes).
SynTool/chem/data/__pycache__/cleaning.cpython-310.pyc ADDED
Binary file (3.86 kB).
SynTool/chem/data/__pycache__/filtering.cpython-310.pyc ADDED
Binary file (27.6 kB).
SynTool/chem/data/__pycache__/mapping.cpython-310.pyc ADDED
Binary file (2.59 kB).
SynTool/chem/data/__pycache__/standardizer.cpython-310.pyc ADDED
Binary file (18 kB).
SynTool/chem/data/cleaning.py ADDED
@@ -0,0 +1,124 @@
1
+ import os
2
+ from multiprocessing import Queue, Process, Manager, Value
3
+ from logging import getLogger, Logger
4
+ from tqdm import tqdm
5
+ from CGRtools.containers import ReactionContainer
6
+
7
+ from .standardizer import Standardizer
8
+ from SynTool.utils.files import ReactionReader, ReactionWriter
9
+ from SynTool.utils.config import ReactionStandardizationConfig
10
+
11
+
12
+ def cleaner(reaction: ReactionContainer, logger: Logger, config: ReactionStandardizationConfig):
13
+ """
14
+ Standardize a reaction using the external standardization script (see standardizer.py)
15
+
16
+ :param reaction: ReactionContainer to clean/standardize
17
+ :param logger: logger instance (a disabled logger can be passed to suppress log output)
18
+ :param config: ReactionStandardizationConfig
19
+ :return: ReactionContainer or empty list
20
+ """
21
+ standardizer = Standardizer(id_tag='Reaction_ID',
22
+ action_on_isotopes=2,
23
+ skip_tautomerize=True,
24
+ skip_errors=config.skip_errors,
25
+ keep_unbalanced_ions=config.keep_unbalanced_ions,
26
+ keep_reagents=config.keep_reagents,
27
+ ignore_mapping=config.ignore_mapping,
28
+ logger=logger)
29
+ return standardizer.standardize(reaction)
30
+
31
+
32
+ def worker_cleaner(to_clean: Queue, to_write: Queue, config: ReactionStandardizationConfig):
33
+ """
34
+ Runs standardization on reactions taken from the to_clean queue and puts the results into the to_write queue
35
+
36
+ :param to_clean: Queue of reactions to clean/standardize
37
+ :param to_write: Standardized outputs to write
38
+ :param config: ReactionStandardizationConfig
39
+ :return: None
40
+ """
41
+ logger = getLogger()
42
+ logger.disabled = True
43
+ while True:
44
+ raw_reaction = to_clean.get()
45
+ if raw_reaction == "Quit":
46
+ break
47
+ res = cleaner(raw_reaction, logger, config)
48
+ to_write.put(res)
49
+ logger.disabled = False
50
+
51
+
52
+ def cleaner_writer(output_file: str, to_write: Queue, cleaned_nb: Value, remove_old=True):
53
+ """
54
+ Writes the standardized reactions to the output file
55
+
56
+ :param output_file: output file path
57
+ :param to_write: Standardized ReactionContainer to write
58
+ :param cleaned_nb: number of final reactions
59
+ :param remove_old: whether to remove an already existing output file
60
+ """
61
+
62
+ if remove_old and os.path.isfile(output_file):
63
+ os.remove(output_file)
64
+
65
+ counter = 0
66
+ seen_reactions = []
67
+ with ReactionWriter(output_file) as out:
68
+ while True:
69
+ res = to_write.get()
70
+ if res:
71
+ if res == "Quit":
72
+ cleaned_nb.set(counter)
73
+ break
74
+ elif isinstance(res, ReactionContainer):
75
+ smi = format(res, "m")
76
+ if smi not in seen_reactions:
77
+ out.write(res)
78
+ counter += 1
79
+ seen_reactions.append(smi)
80
+
81
+
82
+ def reactions_cleaner(config: ReactionStandardizationConfig,
83
+ input_file: str, output_file: str, num_cpus: int, batch_prep_size: int = 100):
84
+ """
85
+ Standardizes the reactions from the input file in parallel and writes the results to the output file
86
+
87
+ :param config: reaction standardization configuration
88
+ :param input_file: input RDF file path
89
+ :param output_file: output RDF file path
90
+ :param num_cpus: number of CPUs to use (num_cpus - 2 worker processes are spawned; reading and writing use the rest)
91
+ :param batch_prep_size: queue size multiplier (to_clean holds up to num_cpus * batch_prep_size reactions)
92
+ """
93
+ with Manager() as m:
94
+ to_clean = m.Queue(maxsize=num_cpus * batch_prep_size)
95
+ to_write = m.Queue(maxsize=batch_prep_size)
96
+ cleaned_nb = m.Value(int, 0)
97
+
98
+ writer = Process(target=cleaner_writer, args=(output_file, to_write, cleaned_nb))
99
+ writer.start()
100
+
101
+ workers = []
102
+ for _ in range(num_cpus - 2):
103
+ w = Process(target=worker_cleaner, args=(to_clean, to_write, config))
104
+ w.start()
105
+ workers.append(w)
106
+
107
+ n = 0
108
+ with ReactionReader(input_file) as reactions:
109
+ for raw_reaction in tqdm(reactions):
110
+ if 'Reaction_ID' not in raw_reaction.meta:
111
+ raw_reaction.meta['Reaction_ID'] = n
112
+ to_clean.put(raw_reaction)
113
+ n += 1
114
+
115
+ for _ in workers:
116
+ to_clean.put("Quit")
117
+ for w in workers:
118
+ w.join()
119
+
120
+ to_write.put("Quit")
121
+ writer.join()
122
+
123
+ print(f'Initial number of reactions: {n}')
124
+ print(f'Removed number of reactions: {n - cleaned_nb.get()}')
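A minimal usage sketch for the parallel cleaning entry point defined above; the file paths are hypothetical, and it is assumed here that ReactionStandardizationConfig (from SynTool.utils.config, not shown in this diff) can be constructed with default values:

    from SynTool.utils.config import ReactionStandardizationConfig
    from SynTool.chem.data.cleaning import reactions_cleaner

    config = ReactionStandardizationConfig()  # assumed default construction
    reactions_cleaner(config=config,
                      input_file="reactions_raw.rdf",     # hypothetical input file
                      output_file="reactions_clean.smi",  # hypothetical output file
                      num_cpus=8)                         # num_cpus - 2 worker processes are spawned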
SynTool/chem/data/filtering.py ADDED
@@ -0,0 +1,917 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Iterable, Tuple, Dict, Any, Optional
5
+ from tqdm.auto import tqdm
6
+
7
+ import numpy as np
8
+ import ray
9
+ import yaml
10
+ from CGRtools.containers import ReactionContainer, MoleculeContainer, CGRContainer
11
+ from StructureFingerprint import MorganFingerprint
12
+
13
+ from SynTool.utils.files import ReactionReader, ReactionWriter
14
+ from SynTool.chem.utils import remove_small_molecules, rebalance_reaction, remove_reagents
15
+ from SynTool.utils.config import ConfigABC, convert_config_to_dict
16
+
17
+
18
+ @dataclass
19
+ class CompeteProductsConfig(ConfigABC):
20
+ fingerprint_tanimoto_threshold: float = 0.3
21
+ mcs_tanimoto_threshold: float = 0.6
22
+
23
+ @staticmethod
24
+ def from_dict(config_dict: Dict[str, Any]):
25
+ """Create an instance of CompeteProductsConfig from a dictionary."""
26
+ return CompeteProductsConfig(**config_dict)
27
+
28
+ @staticmethod
29
+ def from_yaml(file_path: str):
30
+ """Deserialize a YAML file into a CompeteProductsConfig object."""
31
+ with open(file_path, "r") as file:
32
+ config_dict = yaml.safe_load(file)
33
+ return CompeteProductsConfig.from_dict(config_dict)
34
+
35
+ def _validate_params(self, params: Dict[str, Any]):
36
+ """Validate configuration parameters."""
37
+ if not isinstance(params.get("fingerprint_tanimoto_threshold"), float) \
38
+ or not (0 <= params["fingerprint_tanimoto_threshold"] <= 1):
39
+ raise ValueError("Invalid 'fingerprint_tanimoto_threshold'; expected a float between 0 and 1")
40
+
41
+ if not isinstance(params.get("mcs_tanimoto_threshold"), float) \
42
+ or not (0 <= params["mcs_tanimoto_threshold"] <= 1):
43
+ raise ValueError("Invalid 'mcs_tanimoto_threshold'; expected a float between 0 and 1")
44
+
45
+
46
+ class CompeteProductsChecker:
47
+ """Checks whether the reaction has competing products."""
48
+
49
+ def __init__(
50
+ self,
51
+ fingerprint_tanimoto_threshold: float = 0.3,
52
+ mcs_tanimoto_threshold: float = 0.6,
53
+ ):
54
+ self.fingerprint_tanimoto_threshold = fingerprint_tanimoto_threshold
55
+ self.mcs_tanimoto_threshold = mcs_tanimoto_threshold
56
+
57
+ @staticmethod
58
+ def from_config(config: CompeteProductsConfig):
59
+ """Creates an instance of CompeteProductsChecker from a configuration object."""
60
+ return CompeteProductsChecker(
61
+ config.fingerprint_tanimoto_threshold, config.mcs_tanimoto_threshold
62
+ )
63
+
64
+ def __call__(self, reaction: ReactionContainer) -> bool:
65
+ """
66
+ Returns True if the reaction has competing products, else False
67
+
68
+ :param reaction: input reaction
69
+ :return: True or False
70
+ """
71
+ mf = MorganFingerprint()
72
+ is_compete = False
73
+
74
+ # Check for competing products using both fingerprint similarity and maximum common substructure (MCS) similarity
75
+ for mol in reaction.reagents:
76
+ for other_mol in reaction.products:
77
+ if len(mol) > 6 and len(other_mol) > 6:
78
+ # Compute fingerprint similarity
79
+ molf = mf.transform([mol])
80
+ other_molf = mf.transform([other_mol])
81
+ fingerprint_tanimoto = tanimoto_kernel(molf, other_molf)[0][0]
82
+
83
+ # If fingerprint similarity is high enough, check for MCS similarity
84
+ if fingerprint_tanimoto > self.fingerprint_tanimoto_threshold:
85
+ try:
86
+ # Find the maximum common substructure (MCS) and compute its size
87
+ clique_size = len(next(mol.get_mcs_mapping(other_mol, limit=100)))
88
+
89
+ # Calculate MCS similarity based on MCS size
90
+ mcs_tanimoto = clique_size / (len(mol) + len(other_mol) - clique_size)
91
+
92
+ # If MCS similarity is also high enough, mark the reaction as having competing products
93
+ if mcs_tanimoto > self.mcs_tanimoto_threshold:
94
+ is_compete = True
95
+ break
96
+ except StopIteration:
97
+ continue
98
+
99
+ return is_compete
100
+
101
+
102
+ @dataclass
103
+ class DynamicBondsConfig(ConfigABC):
104
+ min_bonds_number: int = 1
105
+ max_bonds_number: int = 6
106
+
107
+ @staticmethod
108
+ def from_dict(config_dict: Dict[str, Any]):
109
+ """Create an instance of DynamicBondsConfig from a dictionary."""
110
+ return DynamicBondsConfig(**config_dict)
111
+
112
+ @staticmethod
113
+ def from_yaml(file_path: str):
114
+ """Deserialize a YAML file into a DynamicBondsConfig object."""
115
+ with open(file_path, "r") as file:
116
+ config_dict = yaml.safe_load(file)
117
+ return DynamicBondsConfig.from_dict(config_dict)
118
+
119
+ def _validate_params(self, params: Dict[str, Any]):
120
+ """Validate configuration parameters."""
121
+ if not isinstance(params.get("min_bonds_number"), int) \
122
+ or params["min_bonds_number"] < 0:
123
+ raise ValueError(
124
+ "Invalid 'min_bonds_number'; expected a non-negative integer")
125
+
126
+ if not isinstance(params.get("max_bonds_number"), int) \
127
+ or params["max_bonds_number"] < 0:
128
+ raise ValueError("Invalid 'max_bonds_number'; expected a non-negative integer")
129
+
130
+ if params["min_bonds_number"] > params["max_bonds_number"]:
131
+ raise ValueError("'min_bonds_number' cannot be greater than 'max_bonds_number'")
132
+
133
+
134
+ class DynamicBondsChecker:
135
+ """Checks if there is an unacceptable number of dynamic bonds in CGR."""
136
+
137
+ def __init__(self, min_bonds_number: int = 1, max_bonds_number: int = 6):
138
+ self.min_bonds_number = min_bonds_number
139
+ self.max_bonds_number = max_bonds_number
140
+
141
+ @staticmethod
142
+ def from_config(config: DynamicBondsConfig):
143
+ """Creates an instance of DynamicBondsChecker from a configuration object."""
144
+ return DynamicBondsChecker(config.min_bonds_number, config.max_bonds_number)
145
+
146
+ def __call__(self, reaction: ReactionContainer) -> bool:
147
+ cgr = ~reaction
148
+ return not (self.min_bonds_number <= len(cgr.center_bonds) <= self.max_bonds_number)
149
+
150
+
151
+ @dataclass
152
+ class SmallMoleculesConfig(ConfigABC):
153
+ limit: int = 6
154
+
155
+ @staticmethod
156
+ def from_dict(config_dict: Dict[str, Any]):
157
+ """Create an instance of SmallMoleculesConfig from a dictionary."""
158
+ return SmallMoleculesConfig(**config_dict)
159
+
160
+ @staticmethod
161
+ def from_yaml(file_path: str):
162
+ """Deserialize a YAML file into a SmallMoleculesConfig object."""
163
+ with open(file_path, "r") as file:
164
+ config_dict = yaml.safe_load(file)
165
+ return SmallMoleculesConfig.from_dict(config_dict)
166
+
167
+ def _validate_params(self, params: Dict[str, Any]):
168
+ """Validate configuration parameters."""
169
+ if not isinstance(params.get("limit"), int) or params["limit"] < 1:
170
+ raise ValueError("Invalid 'limit'; expected a positive integer")
171
+
172
+
173
+ class SmallMoleculesChecker:
174
+ """Checks if there are only small molecules in the reaction or if there is only one small reactant or product."""
175
+
176
+ def __init__(self, limit: int = 6):
177
+ self.limit = limit
178
+
179
+ @staticmethod
180
+ def from_config(config: SmallMoleculesConfig):
181
+ """Creates an instance of SmallMoleculesChecker from a configuration object."""
182
+ return SmallMoleculesChecker(config.limit)
183
+
184
+ def __call__(self, reaction: ReactionContainer) -> bool:
185
+ if (len(reaction.reactants) == 1 and self.are_only_small_molecules(reaction.reactants)) \
186
+ or (len(reaction.products) == 1 and self.are_only_small_molecules(reaction.products)) \
187
+ or (self.are_only_small_molecules(reaction.reactants) and self.are_only_small_molecules(reaction.products)):
188
+ return True
189
+ return False
190
+
191
+ def are_only_small_molecules(self, molecules: Iterable[MoleculeContainer]) -> bool:
192
+ """Checks if all molecules in the given iterable are small molecules."""
193
+ return all(len(molecule) <= self.limit for molecule in molecules)
194
+
195
+
196
+ @dataclass
197
+ class CGRConnectedComponentsConfig:
198
+ pass
199
+
200
+
201
+ class CGRConnectedComponentsChecker:
202
+ """Checks whether the CGR (built without reagents) contains disconnected components."""
203
+
204
+ @staticmethod
205
+ def from_config(config: CGRConnectedComponentsConfig): # TODO config class not used
206
+ """Creates an instance of CGRConnectedComponentsChecker from a configuration object."""
207
+ return CGRConnectedComponentsChecker()
208
+
209
+ def __call__(self, reaction: ReactionContainer) -> bool:
210
+ tmp_reaction = ReactionContainer(reaction.reactants, reaction.products)
211
+ cgr = ~tmp_reaction
212
+ return cgr.connected_components_count > 1
213
+
214
+
215
+ @dataclass
216
+ class RingsChangeConfig:
217
+ pass
218
+
219
+
220
+ class RingsChangeChecker:
221
+ """Checks whether the number of rings changes in the reaction."""
222
+
223
+ @staticmethod
224
+ def from_config(config: RingsChangeConfig): # TODO config class not used
225
+ """Creates an instance of RingsChangeChecker from a configuration object."""
226
+ return RingsChangeChecker()
227
+
228
+ def __call__(self, reaction: ReactionContainer):
229
+ """
230
+ Returns True if there are valence mistakes in the reaction or if the number of all rings or
231
+ aromatic rings differs between reactants and products (ring change in the reaction)
232
+
233
+ :param reaction: input reaction
234
+ :return: True or False
235
+ """
236
+
237
+ reaction.kekule()
238
+ reaction.thiele()
239
+ r_rings, r_arom_rings = self._calc_rings(reaction.reactants)
240
+ p_rings, p_arom_rings = self._calc_rings(reaction.products)
241
+ if (r_arom_rings != p_arom_rings) or (r_rings != p_rings):
242
+ return True
243
+ else:
244
+ return False
245
+
246
+ @staticmethod
247
+ def _calc_rings(molecules: Iterable) -> Tuple[int, int]:
248
+ """
249
+ Calculates number of all rings and number of aromatic rings in molecules
250
+
251
+ :param molecules: set of molecules
252
+ :return: number of all rings and number of aromatic rings in molecules
253
+ """
254
+ rings, arom_rings = 0, 0
255
+ for mol in molecules:
256
+ rings += mol.rings_count
257
+ arom_rings += len(mol.aromatic_rings)
258
+ return rings, arom_rings
259
+
260
+
261
+ @dataclass
262
+ class StrangeCarbonsConfig:
263
+ # Currently empty, but can be extended in the future if needed
264
+ pass
265
+
266
+
267
+ class StrangeCarbonsChecker:
268
+ """Checks if there are 'strange' carbons in the reaction."""
269
+
270
+ @staticmethod
271
+ def from_config(config: StrangeCarbonsConfig): # TODO config class not used
272
+ """Creates an instance of StrangeCarbonsChecker from a configuration object."""
273
+ return StrangeCarbonsChecker()
274
+
275
+ def __call__(self, reaction: ReactionContainer) -> bool:
276
+ for molecule in reaction.reactants + reaction.products:
277
+ atoms_types = {a.atomic_symbol for _, a in molecule.atoms()} # atoms types in molecule
278
+ if len(atoms_types) == 1 and atoms_types.pop() == "C":
279
+ if len(molecule) == 1: # methane
280
+ return True
281
+ bond_types = {int(b) for _, _, b in molecule.bonds()}
282
+ if len(bond_types) == 1 and bond_types.pop() != 4:
283
+ return True # C molecules with only one type of bond (not aromatic)
284
+ return False
285
+
286
+
287
+ @dataclass
288
+ class NoReactionConfig:
289
+ # Currently empty, but can be extended in the future if needed
290
+ pass
291
+
292
+
293
+ class NoReactionChecker:
294
+ """Checks if there is no reaction in the provided reaction container."""
295
+
296
+ @staticmethod
297
+ def from_config(config: NoReactionConfig): # TODO config class not used
298
+ """Creates an instance of NoReactionChecker from a configuration object."""
299
+ return NoReactionChecker()
300
+
301
+ def __call__(self, reaction: ReactionContainer) -> bool:
302
+ cgr = ~reaction
303
+ return not cgr.center_atoms and not cgr.center_bonds
304
+
305
+
306
+ @dataclass
307
+ class MultiCenterConfig:
308
+ pass
309
+
310
+
311
+ class MultiCenterChecker:
312
+ """Checks if there is a multicenter reaction."""
313
+
314
+ @staticmethod
315
+ def from_config(config: MultiCenterConfig): # TODO config class not used
316
+ return MultiCenterChecker()
317
+
318
+ def __call__(self, reaction: ReactionContainer) -> bool:
319
+ cgr = ~reaction
320
+ return len(cgr.centers_list) > 1
321
+
322
+
323
+ @dataclass
324
+ class WrongCHBreakingConfig:
325
+ pass
326
+
327
+
328
+ class WrongCHBreakingChecker:
329
+ """Checks for incorrect C-C bond formation from breaking a C-H bond."""
330
+
331
+ @staticmethod
332
+ def from_config(config: WrongCHBreakingConfig): # TODO config class not used
333
+ return WrongCHBreakingChecker()
334
+
335
+ def __call__(self, reaction: ReactionContainer) -> bool:
336
+ """
337
+ Determines if a reaction involves incorrect C-C bond formation from breaking a C-H bond.
338
+
339
+ :param reaction: The reaction to be checked.
340
+ :return: True if incorrect C-C bond formation is found, False otherwise.
341
+ """
342
+
343
+ reaction.kekule()
344
+ if reaction.check_valence():
345
+ return False
346
+ reaction.thiele()
347
+
348
+ copy_reaction = reaction.copy()
349
+ copy_reaction.explicify_hydrogens()
350
+ cgr = ~copy_reaction
351
+ reduced_cgr = cgr.augmented_substructure(cgr.center_atoms, deep=1)
352
+
353
+ return self.is_wrong_c_h_breaking(reduced_cgr)
354
+
355
+ @staticmethod
356
+ def is_wrong_c_h_breaking(cgr: CGRContainer) -> bool:
357
+ """
358
+ Checks for incorrect C-C bond formation from breaking a C-H bond in a CGR.
359
+ :param cgr: The CGR with explicified hydrogens.
360
+ :return: True if incorrect C-C bond formation is found, False otherwise.
361
+ """
362
+ for atom_id in cgr.center_atoms:
363
+ if cgr.atom(atom_id).atomic_symbol == "C":
364
+ is_c_h_breaking, is_c_c_formation = False, False
365
+ c_with_h_id, another_c_id = None, None
366
+
367
+ for neighbour_id, bond in cgr._bonds[atom_id].items():
368
+ neighbour = cgr.atom(neighbour_id)
369
+
370
+ if (
371
+ bond.order
372
+ and not bond.p_order
373
+ and neighbour.atomic_symbol == "H"
374
+ ):
375
+ is_c_h_breaking = True
376
+ c_with_h_id = atom_id
377
+
378
+ elif (
379
+ not bond.order
380
+ and bond.p_order
381
+ and neighbour.atomic_symbol == "C"
382
+ ):
383
+ is_c_c_formation = True
384
+ another_c_id = neighbour_id
385
+
386
+ if is_c_h_breaking and is_c_c_formation:
387
+ # Check for presence of heteroatoms in the first environment of 2 bonding carbons
388
+ if any(
389
+ cgr.atom(neighbour_id).atomic_symbol not in ("C", "H")
390
+ for neighbour_id in cgr._bonds[c_with_h_id]
391
+ ) or any(
392
+ cgr.atom(neighbour_id).atomic_symbol not in ("C", "H")
393
+ for neighbour_id in cgr._bonds[another_c_id]
394
+ ):
395
+ return False
396
+ return True
397
+
398
+ return False
399
+
400
+
401
+ @dataclass
402
+ class CCsp3BreakingConfig:
403
+ pass
404
+
405
+
406
+ class CCsp3BreakingChecker:
407
+ """Checks if there is C(sp3)-C bond breaking."""
408
+
409
+ @staticmethod
410
+ def from_config(config: CCsp3BreakingConfig): # TODO config class not used
411
+ return CCsp3BreakingChecker()
412
+
413
+ def __call__(self, reaction: ReactionContainer) -> bool:
414
+ """
415
+ Returns True if there is C(sp3)-C bond breaking, else False
416
+
417
+ :param reaction: input reaction
418
+ :return: True or False
419
+ """
420
+ cgr = ~reaction
421
+ reaction_center = cgr.augmented_substructure(cgr.center_atoms, deep=1)
422
+ for atom_id, neighbour_id, bond in reaction_center.bonds():
423
+ atom = reaction_center.atom(atom_id)
424
+ neighbour = reaction_center.atom(neighbour_id)
425
+
426
+ is_bond_broken = bond.order is not None and bond.p_order is None
427
+ are_atoms_carbons = (
428
+ atom.atomic_symbol == "C" and neighbour.atomic_symbol == "C"
429
+ )
430
+ is_atom_sp3 = atom.hybridization == 1 or neighbour.hybridization == 1
431
+
432
+ if is_bond_broken and are_atoms_carbons and is_atom_sp3:
433
+ return True
434
+ return False
435
+
436
+
437
+ @dataclass
438
+ class CCRingBreakingConfig:
439
+ pass
440
+
441
+
442
+ class CCRingBreakingChecker:
443
+ """Checks if a reaction involves ring C-C bond breaking."""
444
+
445
+ @staticmethod
446
+ def from_config(config: CCRingBreakingConfig): # TODO config class not used
447
+ return CCRingBreakingChecker()
448
+
449
+ def __call__(self, reaction: ReactionContainer) -> bool:
450
+ """
451
+ Returns True if the reaction involves ring C-C bond breaking, else False
452
+
453
+ :param reaction: input reaction
454
+ :return: True or False
455
+ """
456
+ cgr = ~reaction
457
+
458
+ # Extract reactants' center atoms and their rings
459
+ reactants_center_atoms = {}
460
+ reactants_rings = set()
461
+ for reactant in reaction.reactants:
462
+ reactants_rings.update(reactant.sssr)
463
+ for n, atom in reactant.atoms():
464
+ if n in cgr.center_atoms:
465
+ reactants_center_atoms[n] = atom
466
+
467
+ # Identify reaction center based on center atoms
468
+ reaction_center = cgr.augmented_substructure(atoms=cgr.center_atoms, deep=0)
469
+
470
+ # Iterate over bonds in the reaction center and check for ring C-C bond breaking
471
+ for atom_id, neighbour_id, bond in reaction_center.bonds():
472
+ try:
473
+ # Retrieve corresponding atoms from reactants
474
+ atom = reactants_center_atoms[atom_id]
475
+ neighbour = reactants_center_atoms[neighbour_id]
476
+ except KeyError:
477
+ continue
478
+ else:
479
+ # Check if the bond is broken and both atoms are carbons in rings of size 5, 6, or 7
480
+ is_bond_broken = (bond.order is not None) and (bond.p_order is None)
481
+ are_atoms_carbons = (
482
+ atom.atomic_symbol == "C" and neighbour.atomic_symbol == "C"
483
+ )
484
+ are_atoms_in_ring = (
485
+ set(atom.ring_sizes).intersection({5, 6, 7})
486
+ and set(neighbour.ring_sizes).intersection({5, 6, 7})
487
+ and any(
488
+ atom_id in ring and neighbour_id in ring
489
+ for ring in reactants_rings
490
+ )
491
+ )
492
+
493
+ # If all conditions are met, indicate ring C-C bond breaking
494
+ if is_bond_broken and are_atoms_carbons and are_atoms_in_ring:
495
+ return True
496
+
497
+ return False
498
+
499
+
500
+ @dataclass
501
+ class ReactionCheckConfig(ConfigABC):
502
+ """
503
+ Configuration class for reaction checks, inheriting from ConfigABC.
504
+
505
+ This class manages configuration settings for various reaction checkers, including paths, file formats,
506
+ and checker-specific parameters.
507
+
508
+ Attributes:
509
+ dynamic_bonds_config: Configuration for dynamic bonds checking.
510
+ small_molecules_config: Configuration for small molecules checking.
511
+ strange_carbons_config: Configuration for strange carbons checking.
512
+ compete_products_config: Configuration for competing products checking.
513
+ cgr_connected_components_config: Configuration for CGR connected components checking.
514
+ rings_change_config: Configuration for rings change checking.
515
+ no_reaction_config: Configuration for no reaction checking.
516
+ multi_center_config: Configuration for multi-center checking.
517
+ wrong_ch_breaking_config: Configuration for wrong C-H breaking checking.
518
+ cc_sp3_breaking_config: Configuration for CC sp3 breaking checking.
519
+ cc_ring_breaking_config: Configuration for CC ring breaking checking.
520
+ """
521
+
522
+ # Configuration for reaction checkers
523
+ dynamic_bonds_config: Optional[DynamicBondsConfig] = None
524
+ small_molecules_config: Optional[SmallMoleculesConfig] = None
525
+ strange_carbons_config: Optional[StrangeCarbonsConfig] = None
526
+ compete_products_config: Optional[CompeteProductsConfig] = None
527
+ cgr_connected_components_config: Optional[CGRConnectedComponentsConfig] = None
528
+ rings_change_config: Optional[RingsChangeConfig] = None
529
+ no_reaction_config: Optional[NoReactionConfig] = None
530
+ multi_center_config: Optional[MultiCenterConfig] = None
531
+ wrong_ch_breaking_config: Optional[WrongCHBreakingConfig] = None
532
+ cc_sp3_breaking_config: Optional[CCsp3BreakingConfig] = None
533
+ cc_ring_breaking_config: Optional[CCRingBreakingConfig] = None
534
+
535
+ # Other configuration parameters
536
+ rebalance_reaction: bool = False
537
+ remove_reagents: bool = True
538
+ reagents_max_size: int = 7
539
+ remove_small_molecules: bool = False
540
+ small_molecules_max_size: int = 6
541
+
542
+ def to_dict(self):
543
+ """
544
+ Converts the configuration into a dictionary.
545
+ """
546
+ config_dict = {
547
+ "dynamic_bonds_config": convert_config_to_dict(
548
+ self.dynamic_bonds_config, DynamicBondsConfig
549
+ ),
550
+ "small_molecules_config": convert_config_to_dict(
551
+ self.small_molecules_config, SmallMoleculesConfig
552
+ ),
553
+ "compete_products_config": convert_config_to_dict(
554
+ self.compete_products_config, CompeteProductsConfig
555
+ ),
556
+ "cgr_connected_components_config": {}
557
+ if self.cgr_connected_components_config is not None
558
+ else None,
559
+ "rings_change_config": {} if self.rings_change_config is not None else None,
560
+ "strange_carbons_config": {}
561
+ if self.strange_carbons_config is not None
562
+ else None,
563
+ "no_reaction_config": {} if self.no_reaction_config is not None else None,
564
+ "multi_center_config": {} if self.multi_center_config is not None else None,
565
+ "wrong_ch_breaking_config": {}
566
+ if self.wrong_ch_breaking_config is not None
567
+ else None,
568
+ "cc_sp3_breaking_config": {}
569
+ if self.cc_sp3_breaking_config is not None
570
+ else None,
571
+ "cc_ring_breaking_config": {}
572
+ if self.cc_ring_breaking_config is not None
573
+ else None,
574
+ "rebalance_reaction": self.rebalance_reaction,
575
+ "remove_reagents": self.remove_reagents,
576
+ "reagents_max_size": self.reagents_max_size,
577
+ "remove_small_molecules": self.remove_small_molecules,
578
+ "small_molecules_max_size": self.small_molecules_max_size,
579
+ }
580
+
581
+ filtered_config_dict = {k: v for k, v in config_dict.items() if v is not None}
582
+
583
+ return filtered_config_dict
584
+
585
+ @staticmethod
586
+ def from_dict(config_dict: Dict[str, Any]):
587
+ """
588
+ Create an instance of ReactionCheckConfig from a dictionary.
589
+ """
590
+ # Instantiate configuration objects if their corresponding dictionary is present
591
+ dynamic_bonds_config = (
592
+ DynamicBondsConfig(**config_dict["dynamic_bonds_config"])
593
+ if "dynamic_bonds_config" in config_dict
594
+ else None
595
+ )
596
+ small_molecules_config = (
597
+ SmallMoleculesConfig(**config_dict["small_molecules_config"])
598
+ if "small_molecules_config" in config_dict
599
+ else None
600
+ )
601
+ compete_products_config = (
602
+ CompeteProductsConfig(**config_dict["compete_products_config"])
603
+ if "compete_products_config" in config_dict
604
+ else None
605
+ )
606
+ cgr_connected_components_config = (
607
+ CGRConnectedComponentsConfig()
608
+ if "cgr_connected_components_config" in config_dict
609
+ else None
610
+ )
611
+ rings_change_config = (
612
+ RingsChangeConfig()
613
+ if "rings_change_config" in config_dict
614
+ else None
615
+ )
616
+ strange_carbons_config = (
617
+ StrangeCarbonsConfig()
618
+ if "strange_carbons_config" in config_dict
619
+ else None
620
+ )
621
+ no_reaction_config = (
622
+ NoReactionConfig()
623
+ if "no_reaction_config" in config_dict
624
+ else None
625
+ )
626
+ multi_center_config = (
627
+ MultiCenterConfig()
628
+ if "multi_center_config" in config_dict
629
+ else None
630
+ )
631
+ wrong_ch_breaking_config = (
632
+ WrongCHBreakingConfig()
633
+ if "wrong_ch_breaking_config" in config_dict
634
+ else None
635
+ )
636
+ cc_sp3_breaking_config = (
637
+ CCsp3BreakingConfig()
638
+ if "cc_sp3_breaking_config" in config_dict
639
+ else None
640
+ )
641
+ cc_ring_breaking_config = (
642
+ CCRingBreakingConfig()
643
+ if "cc_ring_breaking_config" in config_dict
644
+ else None
645
+ )
646
+
647
+ # Extract other simple configuration parameters
648
+ rebalance_reaction = config_dict.get("rebalance_reaction", False)
649
+ remove_reagents = config_dict.get("remove_reagents", True)
650
+ reagents_max_size = config_dict.get("reagents_max_size", 7)
651
+ remove_small_molecules = config_dict.get("remove_small_molecules", False)
652
+ small_molecules_max_size = config_dict.get("small_molecules_max_size", 6)
653
+
654
+ return ReactionCheckConfig(
655
+ dynamic_bonds_config=dynamic_bonds_config,
656
+ small_molecules_config=small_molecules_config,
657
+ compete_products_config=compete_products_config,
658
+ cgr_connected_components_config=cgr_connected_components_config,
659
+ rings_change_config=rings_change_config,
660
+ strange_carbons_config=strange_carbons_config,
661
+ no_reaction_config=no_reaction_config,
662
+ multi_center_config=multi_center_config,
663
+ wrong_ch_breaking_config=wrong_ch_breaking_config,
664
+ cc_sp3_breaking_config=cc_sp3_breaking_config,
665
+ cc_ring_breaking_config=cc_ring_breaking_config,
666
+ rebalance_reaction=rebalance_reaction,
667
+ remove_reagents=remove_reagents,
668
+ reagents_max_size=reagents_max_size,
669
+ remove_small_molecules=remove_small_molecules,
670
+ small_molecules_max_size=small_molecules_max_size,
671
+ )
672
+
673
+ @staticmethod
674
+ def from_yaml(file_path):
675
+ """
676
+ Deserializes a YAML file into a ReactionCheckConfig object.
677
+ """
678
+ with open(file_path, "r") as file:
679
+ config_dict = yaml.safe_load(file)
680
+ return ReactionCheckConfig.from_dict(config_dict)
681
+
682
+ def _validate_params(self, params: Dict[str, Any]):
683
+ if not isinstance(params["rebalance_reaction"], bool):
684
+ raise ValueError("rebalance_reaction must be a boolean.")
685
+
686
+ if not isinstance(params["remove_reagents"], bool):
687
+ raise ValueError("remove_reagents must be a boolean.")
688
+
689
+ if not isinstance(params["reagents_max_size"], int):
690
+ raise ValueError("reagents_max_size must be an int.")
691
+
692
+ if not isinstance(params["remove_small_molecules"], bool):
693
+ raise ValueError("remove_small_molecules must be a boolean.")
694
+
695
+ if not isinstance(params["small_molecules_max_size"], int):
696
+ raise ValueError("small_molecules_max_size must be an int.")
697
+
698
+ def create_checkers(self):
699
+ checker_instances = []
700
+
701
+ if self.dynamic_bonds_config is not None:
702
+ checker_instances.append(
703
+ DynamicBondsChecker.from_config(self.dynamic_bonds_config)
704
+ )
705
+
706
+ if self.small_molecules_config is not None:
707
+ checker_instances.append(
708
+ SmallMoleculesChecker.from_config(self.small_molecules_config)
709
+ )
710
+
711
+ if self.strange_carbons_config is not None:
712
+ checker_instances.append(
713
+ StrangeCarbonsChecker.from_config(self.strange_carbons_config)
714
+ )
715
+
716
+ if self.compete_products_config is not None:
717
+ checker_instances.append(
718
+ CompeteProductsChecker.from_config(self.compete_products_config)
719
+ )
720
+
721
+ if self.cgr_connected_components_config is not None:
722
+ checker_instances.append(
723
+ CGRConnectedComponentsChecker.from_config(
724
+ self.cgr_connected_components_config
725
+ )
726
+ )
727
+
728
+ if self.rings_change_config is not None:
729
+ checker_instances.append(
730
+ RingsChangeChecker.from_config(self.rings_change_config)
731
+ )
732
+
733
+ if self.no_reaction_config is not None:
734
+ checker_instances.append(
735
+ NoReactionChecker.from_config(self.no_reaction_config)
736
+ )
737
+
738
+ if self.multi_center_config is not None:
739
+ checker_instances.append(
740
+ MultiCenterChecker.from_config(self.multi_center_config)
741
+ )
742
+
743
+ if self.wrong_ch_breaking_config is not None:
744
+ checker_instances.append(
745
+ WrongCHBreakingChecker.from_config(self.wrong_ch_breaking_config)
746
+ )
747
+
748
+ if self.cc_sp3_breaking_config is not None:
749
+ checker_instances.append(
750
+ CCsp3BreakingChecker.from_config(self.cc_sp3_breaking_config)
751
+ )
752
+
753
+ if self.cc_ring_breaking_config is not None:
754
+ checker_instances.append(
755
+ CCRingBreakingChecker.from_config(self.cc_ring_breaking_config)
756
+ )
757
+
758
+ return checker_instances
759
+
760
+
761
+ def tanimoto_kernel(x, y):
762
+ """
763
+ Calculate the Tanimoto coefficient between each element of arrays x and y.
764
+ """
765
+ x = x.astype(np.float64)
766
+ y = y.astype(np.float64)
767
+ x_dot = np.dot(x, y.T)
768
+ x2 = np.sum(x**2, axis=1)
769
+ y2 = np.sum(y**2, axis=1)
770
+
771
+ denominator = np.array([x2] * len(y2)).T + np.array([y2] * len(x2)) - x_dot
772
+ result = np.divide(x_dot, denominator, out=np.zeros_like(x_dot), where=denominator != 0)
773
+
774
+ return result
775
+
776
+
777
+ def remove_file_if_exists(directory: Path, file_names): # TODO not used
778
+ for file_name in file_names:
779
+ file_path = directory / file_name
780
+ if file_path.is_file():
781
+ file_path.unlink()
782
+ logging.warning(f"Removed {file_path}")
783
+
784
+
785
+ def filter_reaction(reaction: ReactionContainer, config: ReactionCheckConfig, checkers: list):
786
+
787
+ is_filtered = False
788
+ if config.remove_small_molecules:
789
+ new_reaction = remove_small_molecules(reaction, number_of_atoms=config.small_molecules_max_size)
790
+ else:
791
+ new_reaction = reaction.copy()
792
+
793
+ if new_reaction is None:
794
+ is_filtered = True
795
+
796
+ if config.remove_reagents and not is_filtered:
797
+ new_reaction = remove_reagents(
798
+ new_reaction,
799
+ keep_reagents=True,
800
+ reagents_max_size=config.reagents_max_size,
801
+ )
802
+
803
+ if new_reaction is None:
804
+ is_filtered = True
805
+ new_reaction = reaction.copy()
806
+ # TODO: if the reaction contains only reagents, should it be kept as is?
807
+
808
+ if not is_filtered:
809
+ if config.rebalance_reaction:
810
+ new_reaction = rebalance_reaction(new_reaction)
811
+ for checker in checkers:
812
+ try: # TODO CGRTools: ValueError: mapping of graphs is not disjoint
813
+ if checker(new_reaction):
814
+ # If checker returns True it means the reaction doesn't pass the check
815
+ new_reaction.meta["filtration_log"] = checker.__class__.__name__
816
+ is_filtered = True
817
+ except:
818
+ is_filtered = True
819
+
820
+
821
+
822
+ return is_filtered, new_reaction
823
+
824
+
825
+ @ray.remote
826
+ def process_batch(batch, config: ReactionCheckConfig, checkers):
827
+ results = []
828
+ for index, reaction in batch:
829
+ try: # TODO CGRtools.exceptions.MappingError: atoms with number {52} not equal
830
+ is_filtered, processed_reaction = filter_reaction(reaction, config, checkers)
831
+ results.append((index, is_filtered, processed_reaction))
832
+ except:
833
+ results.append((index, True, reaction))
834
+ return results
835
+
836
+
837
+ def process_completed_batches(futures, result_file, pbar, treated: int = 0, passed_filters: int = 0):
838
+ done, _ = ray.wait(list(futures.keys()), num_returns=1)
839
+ completed_batch = ray.get(done[0])
840
+
841
+ # Write results of the completed batch to file
842
+ now_treated = 0
843
+ for index, is_filtered, reaction in completed_batch:
844
+ now_treated += 1
845
+ if not is_filtered:
846
+ result_file.write(reaction.meta['init_smiles'])
847
+ passed_filters += 1
848
+
849
+ # Remove completed future and update progress bar
850
+ del futures[done[0]]
851
+ pbar.update(now_treated)
852
+ treated += now_treated
853
+
854
+ return treated, passed_filters
855
+
856
+
857
+ def filter_reactions(
858
+ config: ReactionCheckConfig,
859
+ reaction_database_path: str,
860
+ result_reactions_file_name: str = "reaction_data_filtered.smi",
861
+ append_results: bool = False,
862
+ num_cpus: int = 1,
863
+ batch_size: int = 100,
864
+ ) -> None:
865
+ """
866
+ Processes a database of chemical reactions, applying checks based on the provided configuration,
867
+ and writes the results to specified files. All configurations are provided by the ReactionCheckConfig object.
868
+
869
+ :param config: ReactionCheckConfig object containing all configuration settings.
870
+ :param reaction_database_path: Path to the reaction database file.
871
+ :param result_reactions_file_name: Name for the file containing cleaned reactions.
872
+ :param append_results: Flag indicating whether to append results to existing files.
873
+ :param num_cpus: Number of CPUs to use for processing.
874
+ :param batch_size: Size of the batch for processing reactions.
875
+ :return: None. The function writes the reactions that passed all the filters to the
877
+ result file.
877
+ """
878
+
879
+ checkers = config.create_checkers()
880
+
881
+ ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level=logging.ERROR)
882
+ max_concurrent_batches = num_cpus # Limit the number of concurrent batches
883
+
884
+ with ReactionReader(reaction_database_path) as reactions, \
885
+ ReactionWriter(result_reactions_file_name, append_results) as result_file:
886
+
887
+ pbar = tqdm(reactions, leave=True) # TODO fix progress bars
888
+
889
+ futures = {}
890
+ batch = []
891
+ treated = filtered = 0
892
+ for index, reaction in enumerate(reactions):
893
+ reaction.meta["reaction_index"] = index
894
+ batch.append((index, reaction))
895
+ if len(batch) == batch_size:
896
+ future = process_batch.remote(batch, config, checkers)
897
+ futures[future] = None
898
+ batch = []
899
+
900
+ # Check and process completed tasks if we've reached the concurrency limit
901
+ while len(futures) >= max_concurrent_batches:
902
+ treated, filtered = process_completed_batches(futures, result_file, pbar, treated, filtered)
903
+
904
+ # Process the last batch if it's not empty
905
+ if batch:
906
+ future = process_batch.remote(batch, config, checkers)
907
+ futures[future] = None
908
+
909
+ # Process remaining batches
910
+ while futures:
911
+ treated, filtered = process_completed_batches(futures, result_file, pbar, treated, filtered)
912
+
913
+ pbar.close()
914
+
915
+ ray.shutdown()
916
+ print(f'Initial number of reactions: {treated}')
917
+ print(f'Removed number of reactions: {treated - filtered}')
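A usage sketch for the filtering pipeline defined above. A checker is enabled by passing its config object (create_checkers instantiates only the checkers whose config is not None); the file paths below are hypothetical:

    from SynTool.chem.data.filtering import (ReactionCheckConfig, DynamicBondsConfig,
                                             SmallMoleculesConfig, NoReactionConfig,
                                             filter_reactions)

    config = ReactionCheckConfig(
        dynamic_bonds_config=DynamicBondsConfig(min_bonds_number=1, max_bonds_number=6),
        small_molecules_config=SmallMoleculesConfig(limit=6),
        no_reaction_config=NoReactionConfig(),
    )
    filter_reactions(config,
                     reaction_database_path="reactions_clean.smi",         # hypothetical input
                     result_reactions_file_name="reactions_filtered.smi",  # reactions that passed all checks
                     num_cpus=8,
                     batch_size=100)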
SynTool/chem/data/mapping.py ADDED
@@ -0,0 +1,96 @@
1
+ from pathlib import Path
2
+ from os.path import splitext
3
+ from typing import Union
4
+ from tqdm import tqdm
5
+
6
+ from chython import smiles, RDFRead, RDFWrite, ReactionContainer
7
+ from chython.exceptions import MappingError, IncorrectSmiles
8
+
9
+ from SynTool.utils import path_type
10
+
11
+
12
+ def remove_reagents_and_map(rea: ReactionContainer, keep_reagent: bool = False) -> Union[ReactionContainer, None]:
13
+ """
14
+ Maps atoms of the reaction using chytorch.
15
+
16
+ :param rea: reaction to map
17
+ :type rea: ReactionContainer
18
+ :param keep_reagent: whether to keep the reagents (if False, they are removed)
19
+ :type keep_reagent: bool
20
+
21
+ :return: ReactionContainer or None
22
+ """
23
+ try:
24
+ rea.reset_mapping()
25
+ except MappingError:
26
+ rea.reset_mapping() # Successive reset_mapping works
27
+ if not keep_reagent:
28
+ try:
29
+ rea.remove_reagents()
30
+ except:
31
+ return None
32
+ return rea
33
+
34
+
35
+ def remove_reagents_and_map_from_file(input_file: path_type, output_file: path_type, keep_reagent: bool = False) -> None:
36
+ """
37
+ Reads a file of reactions and maps atoms of the reactions using chytorch.
38
+
39
+ :param input_file: the path and name of the input file
40
+ :type input_file: path_type
41
+ :param output_file: the path and name of the output file
42
+ :type output_file: path_type
43
+ :param keep_reagent: whether to keep the reagents (if False, they are removed)
44
+ :type keep_reagent: bool
45
+
46
+ :return: None
47
+ """
48
+ input_file = str(Path(input_file).resolve(strict=True))
49
+ _, input_ext = splitext(input_file)
50
+ if input_ext == ".smi":
51
+ input_file = open(input_file, "r")
52
+ elif input_ext == ".rdf":
53
+ input_file = RDFRead(input_file, indexable=True)
54
+ else:
55
+ raise ValueError("File extension not recognized. File:", input_file,
56
+ "- Please use smi or rdf file")
57
+ enumerator = input_file if input_ext == ".rdf" else input_file.readlines()
58
+
59
+ _, out_ext = splitext(output_file)
60
+ if out_ext == ".smi":
61
+ output_file = open(output_file, "w")
62
+ elif out_ext == ".rdf":
63
+ output_file = RDFWrite(output_file)
64
+ else:
65
+ raise ValueError("File extension not recognized. File:", output_file,
66
+ "- Please use smi or rdf file")
67
+
68
+ mapping_errors = 0
69
+ parsing_errors = 0
70
+ for rea_raw in tqdm(enumerator):
71
+ try:
72
+ rea = smiles(rea_raw.strip('\n')) if input_ext == ".smi" else rea_raw
73
+ except IncorrectSmiles:
74
+ parsing_errors += 1
75
+ continue
76
+ try:
77
+ rea_mapped = remove_reagents_and_map(rea, keep_reagent)
78
+ except MappingError:
79
+ try:
80
+ rea_mapped = remove_reagents_and_map(smiles(str(rea)), keep_reagent)
81
+ except MappingError:
82
+ mapping_errors += 1
83
+ continue
84
+ if rea_mapped:
85
+ rea_output = format(rea, "m") + "\n" if out_ext == ".smi" else rea
86
+ output_file.write(rea_output)
87
+ else:
88
+ mapping_errors += 1
89
+
90
+ input_file.close()
91
+ output_file.close()
92
+
93
+ if parsing_errors:
94
+ print(parsing_errors, "reactions couldn't be parsed")
95
+ if mapping_errors:
96
+ print(mapping_errors, "reactions couldn't be mapped")
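A usage sketch for the atom-mapping step defined above; remove_reagents_and_map_from_file accepts .smi or .rdf for both input and output, and the file names below are hypothetical:

    from SynTool.chem.data.mapping import remove_reagents_and_map_from_file

    # Re-maps atoms with chython's reset_mapping() and removes reagents unless keep_reagent=True.
    remove_reagents_and_map_from_file(input_file="reactions_raw.smi",      # hypothetical
                                      output_file="reactions_mapped.smi",  # hypothetical
                                      keep_reagent=False)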
SynTool/chem/data/mapping.py.bk ADDED
@@ -0,0 +1,90 @@
1
+ from pathlib import Path
2
+ from os.path import splitext
3
+ from typing import Union
4
+ from tqdm import tqdm
5
+
6
+ from chython import smiles, RDFRead, RDFWrite, ReactionContainer
7
+ from chython.exceptions import MappingError
8
+
9
+ from Syntool.utils import path_type
10
+
11
+
12
+ def remove_reagents_and_map(rea: ReactionContainer) -> Union[ReactionContainer, None]:
13
+ """
14
+ Maps atoms of the reaction using chytorch.
15
+
16
+ :param rea: reaction to map
17
+ :type rea: ReactionContainer
18
+
19
+ :return: ReactionContainer or None
20
+ """
21
+ try:
22
+ rea.reset_mapping()
23
+ except MappingError:
24
+ rea.reset_mapping()
25
+ try:
26
+ rea.remove_reagents()
27
+ return rea
28
+ except:
29
+ # print("Error", str(rea))
30
+ return None
31
+
32
+
33
+ def remove_reagents_and_map_from_file(input_file: path_type, output_file: path_type) -> None:
34
+ """
35
+ Reads a file of reactions and maps atoms of the reactions using chytorch.
36
+
37
+ :param input_file: the path and name of the input file
38
+ :type input_file: path_type
39
+
40
+ :param output_file: the path and name of the output file
41
+ :type output_file: path_type
42
+
43
+ :return: None
44
+ """
45
+ input_file = str(Path(input_file).resolve(strict=True))
46
+ _, input_ext = splitext(input_file)
47
+ if input_ext == ".smi":
48
+ input_file = open(input_file, "r")
49
+ elif input_ext == ".rdf":
50
+ input_file = RDFRead(input_file, indexable=True)
51
+ else:
52
+ raise ValueError("File extension not recognized. File:", input_file,
53
+ "- Please use smi or rdf file")
54
+ enumerator = input_file if input_ext == ".rdf" else input_file.readlines()
55
+
56
+ _, out_ext = splitext(output_file)
57
+ if out_ext == ".smi":
58
+ output_file = open(output_file, "w")
59
+ elif out_ext == ".rdf":
60
+ output_file = RDFWrite(output_file)
61
+ else:
62
+ raise ValueError("File extension not recognized. File:", output_file,
63
+ "- Please use smi or rdf file")
64
+
65
+ mapping_errors = 0
66
+ parsing_errors = 0
67
+ for rea_raw in tqdm(enumerator):
68
+ try:
69
+ rea = smiles(rea_raw.strip('\n')) if input_ext == ".smi" else rea_raw
70
+ except:
71
+ parsing_errors += 1
72
+ print("Error", parsing_errors, rea_raw)
73
+ continue
74
+ try:
75
+ rea_mapped = remove_reagents_and_map(rea)
76
+ except:
77
+ parsing_errors += 1
78
+ print("Error for,", rea)
79
+ continue
80
+ if rea_mapped:
81
+ rea_output = format(rea, "m") + "\n" if out_ext == ".smi" else rea
82
+ output_file.write(rea_output)
83
+ else:
84
+ mapping_errors += 1
85
+
86
+ input_file.close()
87
+ output_file.close()
88
+
89
+ if mapping_errors:
90
+ print(mapping_errors, "reactions couldn't be mapped")
SynTool/chem/data/standardizer.py ADDED
@@ -0,0 +1,604 @@
1
+ #############################################################################
2
+ # Code taken from https://github.com/Laboratoire-de-Chemoinformatique/Reaction_Data_Cleaning
3
+ # Reaction_Data_Cleaning/scripts/standardizer.py
4
+ # version as is from commit 793475e54d8b2c7f714165a61e4eb439435d7d92
5
+ # DOI 10.1002/minf.202100119
6
+ #############################################################################
7
+ # Chemical reactions data curation best practices
8
+ # including optimized RDTool
9
+ #############################################################################
10
+ # GNU LGPL https://www.gnu.org/licenses/lgpl-3.0.en.html
11
+ #############################################################################
12
+ # Corresponding Authors: Timur Madzhidov and Alexandre Varnek
13
+ # Corresponding Authors' emails: [email protected] and [email protected]
14
+ # Main contributors: Arkadii Lin, Natalia Duybankova, Ramil Nugmanov, Rail Suleymanov and Timur Madzhidov
15
+ # Copyright: Copyright 2020,
16
+ # MaDeSmart, Machine Design of Small Molecules by AI
17
+ # VLAIO project HBC.2018.2287
18
+ # Credits: Kazan Federal University, Russia
19
+ # University of Strasbourg, France
20
+ # University of Linz, Austria
21
+ # University of Leuven, Belgium
22
+ # Janssen Pharmaceutica N.V., Beerse, Belgium
23
+ # Rail Suleymanov, Arcadia, St. Petersburg, Russia
24
+ # License: GNU LGPL https://www.gnu.org/licenses/lgpl-3.0.en.html
25
+ # Version: 00.02
26
+ #############################################################################
27
+
28
+ from CGRtools.files import RDFRead, RDFWrite, SDFWrite, SDFRead, SMILESRead
29
+ from CGRtools.containers import MoleculeContainer, ReactionContainer
30
+ import logging
31
+ from ordered_set import OrderedSet
32
+ import os
33
+ import io
34
+ import pathlib
35
+ from pathlib import PurePosixPath
36
+
37
+
38
+ class Standardizer:
39
+ def __init__(self, skip_errors=False, log_file=None, keep_unbalanced_ions=False, id_tag='Reaction_ID',
40
+ action_on_isotopes=0, keep_reagents=False, logger=None, ignore_mapping=False, jvm_path=None,
41
+ rdkit_dearomatization=False, remove_unchanged_parts=True, skip_tautomerize=True,
42
+ jchem_path=None, add_reagents_to_reactants=False) -> None:
43
+ if logger is None:
44
+ self.logger = self._config_log(log_file, logger_name='logger')
45
+ else:
46
+ self.logger = logger
47
+ self._skip_errors = skip_errors
48
+ self._keep_unbalanced_ions = keep_unbalanced_ions
49
+ self._id_tag = id_tag
50
+ self._action_on_isotopes = action_on_isotopes
51
+ self._keep_reagents = keep_reagents
52
+ self._ignore_mapping = ignore_mapping
53
+ self._remove_unchanged_parts_flag = remove_unchanged_parts
54
+ self._skip_tautomerize = skip_tautomerize
55
+ self._dearomatize_by_rdkit = rdkit_dearomatization
56
+ self._reagents_to_reactants = add_reagents_to_reactants
57
+ if not skip_tautomerize:
58
+ if jvm_path:
59
+ os.environ['JDK_HOME'] = jvm_path
60
+ os.environ['JAVA_HOME'] = jvm_path
61
+ os.environ['PATH'] += f';{PurePosixPath(jvm_path).joinpath("bin").joinpath("server")};' \
62
+ f'{PurePosixPath(jvm_path).joinpath("bin").joinpath("server")};'
63
+ if jchem_path:
64
+ import jnius_config
65
+ jnius_config.add_classpath(jchem_path)
66
+ from jnius import autoclass
67
+ Standardizer = autoclass('chemaxon.standardizer.Standardizer')
68
+ self._Molecule = autoclass('chemaxon.struc.Molecule')
69
+ self._MolHandler = autoclass('chemaxon.util.MolHandler')
70
+ self._standardizer = Standardizer('tautomerize')
71
+
72
+ def standardize_file(self, input_file=None) -> OrderedSet:
73
+ """
74
+ Standardize a set of reactions in a file. Returns an ordered set of ReactionContainer objects passed the
75
+ standardization protocol.
76
+ :param input_file: str
77
+ :return: OrderedSet
78
+ """
79
+ if pathlib.Path(input_file).suffix == '.rdf':
80
+ data = self._read_RDF(input_file)
81
+ elif pathlib.Path(input_file).suffix == '.smi' or pathlib.Path(input_file).suffix == '.smiles':
82
+ data = self._read_SMILES(input_file)
83
+ else:
84
+ raise ValueError('Data format is not recognized!')
85
+
86
+ print("{0} reactions passed..".format(len(data)))
87
+ return data
88
+
89
+ def _read_RDF(self, input_file) -> OrderedSet:
90
+ """
91
+ Reads an RDF file. Returns an ordered set of ReactionContainer objects passed the standardization protocol.
92
+ :param input_file: str
93
+ :return: OrderedSet
94
+ """
95
+ data = OrderedSet()
96
+ self.logger.info('Start..')
97
+ with RDFRead(input_file, ignore=self._ignore_mapping, store_log=True, remap=self._ignore_mapping) as ifile, \
98
+ open(input_file) as meta_searcher:
99
+ for reaction in ifile._data:
100
+ if isinstance(reaction, tuple):
101
+ meta_searcher.seek(reaction.position)
102
+ flag = False
103
+ for line in meta_searcher:
104
+ if flag and '$RFMT' in line:
105
+ self.logger.critical(f'Reaction id extraction problem rised for the reaction '
106
+ f'#{reaction.number + 1}: a reaction id was expected but $RFMT line '
107
+ f'was found!')
108
+ if flag:
109
+ self.logger.critical(f'Reaction {line.strip().split()[1]}: Parser has returned an error '
110
+ f'message\n{reaction.log}')
111
+ break
112
+ elif '$RFMT' in line:
113
+ self.logger.critical(f'Reaction #{reaction.number + 1} has no reaction id!')
114
+ elif f'$DTYPE {self._id_tag}' in line:
115
+ flag = True
116
+ continue
117
+ standardized_reaction = self.standardize(reaction)
118
+ if standardized_reaction:
119
+ if standardized_reaction not in data:
120
+ data.add(standardized_reaction)
121
+ else:
122
+ i = data.index(standardized_reaction)
123
+ if 'Extraction_IDs' not in data[i].meta:
124
+ data[i].meta['Extraction_IDs'] = ''
125
+ data[i].meta['Extraction_IDs'] = ','.join(data[i].meta['Extraction_IDs'].split(',') +
126
+ [reaction.meta[self._id_tag]])
127
+ self.logger.info('Reaction {0} is a duplicate of the reaction {1}..'
128
+ .format(reaction.meta[self._id_tag], data[i].meta[self._id_tag]))
129
+ return data
130
+
131
+ def _read_SMILES(self, input_file) -> OrderedSet:
132
+ """
133
+ Reads a SMILES file. Returns an ordered set of ReactionContainer objects that passed the standardization protocol.
134
+ :param input_file: str
135
+ :return: OrderedSet
136
+ """
137
+ data = OrderedSet()
138
+ self.logger.info('Start..')
139
+ with SMILESRead(input_file, ignore=True, store_log=True, remap=self._ignore_mapping, header=True) as ifile, \
140
+ open(input_file) as meta_searcher:
141
+ id_tag_position = meta_searcher.readline().strip().split().index(self._id_tag)
142
+ if id_tag_position is None or id_tag_position == 0:
143
+ self.logger.critical(f'No reaction ID tag was found in the header!')
144
+ raise ValueError(f'No reaction ID tag was found in the header!')
145
+ for reaction in ifile._data:
146
+ if isinstance(reaction, tuple):
147
+ meta_searcher.seek(reaction.position)
148
+ line = meta_searcher.readline().strip().split()
149
+ if len(line) <= id_tag_position:
150
+ self.logger.critical(f'No reaction ID tag was found in line {reaction.number}!')
151
+ raise ValueError(f'No reaction ID tag was found in line {reaction.number}!')
152
+ r_id = line[id_tag_position]
153
+ self.logger.critical(f'Reaction {r_id}: Parser has returned an error message\n{reaction.log}')
154
+ continue
155
+
156
+ standardized_reaction = self.standardize(reaction)
157
+ if standardized_reaction:
158
+ if standardized_reaction not in data:
159
+ data.add(standardized_reaction)
160
+ else:
161
+ i = data.index(standardized_reaction)
162
+ if 'Extraction_IDs' not in data[i].meta:
163
+ data[i].meta['Extraction_IDs'] = ''
164
+ data[i].meta['Extraction_IDs'] = ','.join(data[i].meta['Extraction_IDs'].split(',') +
165
+ [reaction.meta[self._id_tag]])
166
+ self.logger.info('Reaction {0} is a duplicate of the reaction {1}..'
167
+ .format(reaction.meta[self._id_tag], data[i].meta[self._id_tag]))
168
+ return data
169
+
170
+ def standardize(self, reaction: ReactionContainer) -> ReactionContainer:
171
+ """
172
+ Standardization protocol: transform functional groups, kekulize, remove explicit hydrogens,
173
+ check for radicals (the reaction is discarded if any are found), check for isotopes, regroup ions (if the
174
+ total charge of reactants and/or products is not zero and the 'keep_unbalanced_ions' option is False,
175
+ which is the default, such reactions are discarded; if it is set to True, they are kept), check valences
176
+ (the reaction is discarded on valence errors), aromatize (Thiele method), fix the mapping of symmetric
177
+ functional groups if present, and remove unchanged parts.
178
+ :param reaction: ReactionContainer
179
+ :return: ReactionContainer
180
+ """
181
+ self.logger.info('Reaction {0}..'.format(reaction.meta[self._id_tag]))
182
+ try:
183
+ reaction.standardize()
184
+ except:
185
+ self.logger.exception(
186
+ 'Reaction {0}: Cannot standardize functional groups..'.format(reaction.meta[self._id_tag]))
187
+ if not self._skip_errors:
188
+ raise Exception(
189
+ 'Reaction {0}: Cannot standardize functional groups..'.format(reaction.meta[self._id_tag]))
190
+ else:
191
+ return
192
+ try:
193
+ reaction.kekule()
194
+ except:
195
+ self.logger.exception('Reaction {0}: Cannot kekulize..'.format(reaction.meta[self._id_tag]))
196
+ if not self._skip_errors:
197
+ raise Exception('Reaction {0}: Cannot kekulize..'.format(reaction.meta[self._id_tag]))
198
+ else:
199
+ return
200
+ try:
201
+ if self._check_valence(reaction):
202
+ self.logger.info(
203
+ 'Reaction {0}: Bad valence: {1}'.format(reaction.meta[self._id_tag], reaction.meta['mistake']))
204
+ return
205
+ except:
206
+ self.logger.exception('Reaction {0}: Cannot check valence..'.format(reaction.meta[self._id_tag]))
207
+ if not self._skip_errors:
208
+ self.logger.critical('Stop the algorithm!')
209
+ raise Exception('Reaction {0}: Cannot check valence..'.format(reaction.meta[self._id_tag]))
210
+ else:
211
+ return
212
+ try:
213
+ if not self._skip_tautomerize:
214
+ reaction = self._tautomerize(reaction)
215
+ except:
216
+ self.logger.exception('Reaction {0}: Cannot tautomerize..'.format(reaction.meta[self._id_tag]))
217
+ if not self._skip_errors:
218
+ raise Exception('Reaction {0}: Cannot tautomerize..'.format(reaction.meta[self._id_tag]))
219
+ else:
220
+ return
221
+ try:
222
+ reaction.implicify_hydrogens()
223
+ except:
224
+ self.logger.exception(
225
+ 'Reaction {0}: Cannot remove explicit hydrogens..'.format(reaction.meta[self._id_tag]))
226
+ if not self._skip_errors:
227
+ raise Exception('Reaction {0}: Cannot remove explicit hydrogens..'.format(reaction.meta[self._id_tag]))
228
+ else:
229
+ return
230
+ try:
231
+ if self._check_radicals(reaction):
232
+ self.logger.info('Reaction {0}: Radicals were found..'.format(reaction.meta[self._id_tag]))
233
+ return
234
+ except:
235
+ self.logger.exception('Reaction {0}: Cannot check radicals..'.format(reaction.meta[self._id_tag]))
236
+ if not self._skip_errors:
237
+ raise Exception('Reaction {0}: Cannot check radicals..'.format(reaction.meta[self._id_tag]))
238
+ else:
239
+ return
240
+ try:
241
+ if self._action_on_isotopes == 1 and self._check_isotopes(reaction):
242
+ self.logger.info('Reaction {0}: Isotopes were found..'.format(reaction.meta[self._id_tag]))
243
+ return
244
+ elif self._action_on_isotopes == 2 and self._check_isotopes(reaction):
245
+ reaction.clean_isotopes()
246
+ self.logger.info('Reaction {0}: Isotopes were removed but the reaction was kept..'.format(
247
+ reaction.meta[self._id_tag]))
248
+ except:
249
+ self.logger.exception('Reaction {0}: Cannot check for isotopes..'.format(reaction.meta[self._id_tag]))
250
+ if not self._skip_errors:
251
+ raise Exception('Reaction {0}: Cannot check for isotopes..'.format(reaction.meta[self._id_tag]))
252
+ else:
253
+ return
254
+ try:
255
+ reaction, return_code = self._split_ions(reaction)
256
+ if return_code == 1:
257
+ self.logger.info('Reaction {0}: Ions were split..'.format(reaction.meta[self._id_tag]))
258
+ elif return_code == 2:
259
+ self.logger.info('Reaction {0}: Ions were split but the reaction is imbalanced..'.format(
260
+ reaction.meta[self._id_tag]))
261
+ if not self._keep_unbalanced_ions:
262
+ return
263
+ except:
264
+ self.logger.exception('Reaction {0}: Cannot group ions..'.format(reaction.meta[self._id_tag]))
265
+ if not self._skip_errors:
266
+ raise Exception('Reaction {0}: Cannot group ions..'.format(reaction.meta[self._id_tag]))
267
+ else:
268
+ return
269
+ try:
270
+ reaction.thiele()
271
+ except:
272
+ self.logger.exception('Reaction {0}: Cannot aromatize..'.format(reaction.meta[self._id_tag]))
273
+ if not self._skip_errors:
274
+ raise Exception('Reaction {0}: Cannot aromatize..'.format(reaction.meta[self._id_tag]))
275
+ else:
276
+ return
277
+ try:
278
+ reaction.fix_mapping()
279
+ except:
280
+ self.logger.exception('Reaction {0}: Cannot fix mapping..'.format(reaction.meta[self._id_tag]))
281
+ if not self._skip_errors:
282
+ raise Exception('Reaction {0}: Cannot fix mapping..'.format(reaction.meta[self._id_tag]))
283
+ else:
284
+ return
285
+ try:
286
+ if self._remove_unchanged_parts_flag:
287
+ reaction = self._remove_unchanged_parts(reaction)
288
+ if not reaction.reactants and reaction.products:
289
+ self.logger.info('Reaction {0}: Reactants are empty..'.format(reaction.meta[self._id_tag]))
290
+ return
291
+ if not reaction.products and reaction.reactants:
292
+ self.logger.info('Reaction {0}: Products are empty..'.format(reaction.meta[self._id_tag]))
293
+ return
294
+ if not reaction.reactants and not reaction.products:
295
+ self.logger.exception(
296
+ 'Reaction {0}: Cannot remove unchanged parts or the reaction is empty..'.format(
297
+ reaction.meta[self._id_tag]))
298
+ return
299
+ except:
300
+ self.logger.exception('Reaction {0}: Cannot remove unchanged parts or the reaction is empty..'.format(
301
+ reaction.meta[self._id_tag]))
302
+ if not self._skip_errors:
303
+ raise Exception('Reaction {0}: Cannot remove unchanged parts or the reaction is empty..'.format(
304
+ reaction.meta[self._id_tag]))
305
+ else:
306
+ return
307
+ self.logger.debug('Reaction {0} is done..'.format(reaction.meta[self._id_tag]))
308
+ return reaction
309
+
310
+ def write(self, output_file: str, data: OrderedSet) -> None:
311
+ """
312
+ Dump a set of reactions.
313
+ :param data: OrderedSet
314
+ :param output_file: str
315
+ :return: None
316
+ """
317
+ with RDFWrite(output_file) as out:
318
+ for r in data:
319
+ out.write(r)
320
+
321
+ def _check_valence(self, reaction: ReactionContainer) -> bool:
322
+ """
323
+ Checks valences.
324
+ :param reaction: ReactionContainer
325
+ :return: bool
326
+ """
327
+ mistakes = []
328
+ for molecule in (reaction.reactants + reaction.products + reaction.reagents):
329
+ valence_mistakes = molecule.check_valence()
330
+ if valence_mistakes:
331
+ mistakes.append(("|".join([str(num) for num in valence_mistakes]),
332
+ "|".join([str(molecule.atom(n)) for n in valence_mistakes]), str(molecule)))
333
+ if mistakes:
334
+ message = ",".join([f'{atom_nums} at {atoms} in {smiles}' for atom_nums, atoms, smiles in mistakes])
335
+ reaction.meta['mistake'] = f'Valence mistake: {message}'
336
+ return True
337
+ return False
338
+
339
+ def _config_log(self, log_file: str, logger_name: str):
340
+ logger = logging.getLogger(logger_name)
341
+ logger.setLevel(logging.DEBUG)
342
+ formatter = logging.Formatter(fmt='%(asctime)s: %(message)s', datefmt='%d/%m/%Y %H:%M:%S')
343
+ logger.handlers.clear()
344
+ fileHandler = logging.FileHandler(filename=log_file, mode='w')
345
+ fileHandler.setFormatter(formatter)
346
+ fileHandler.setLevel(logging.DEBUG)
347
+ logger.addHandler(fileHandler)
348
+ # logging.basicConfig(filename=log_file, level=logging.info, filemode='w', format='%(asctime)s: %(message)s',
349
+ # datefmt='%d/%m/%Y %H:%M:%S')
350
+ return logger
351
+
352
+ def _check_radicals(self, reaction: ReactionContainer) -> bool:
353
+ """
354
+ Checks radicals.
355
+ :param reaction: ReactionContainer
356
+ :return: bool
357
+ """
358
+ for molecule in (reaction.reactants + reaction.products + reaction.reagents):
359
+ for n, atom in molecule.atoms():
360
+ if atom.is_radical:
361
+ return True
362
+ return False
363
+
364
+ def _calc_charge(self, molecule: MoleculeContainer) -> int:
365
+ """Computes the total formal charge of a molecule.
366
+ :param molecule: MoleculeContainer
367
+ :return: int
368
+ """
369
+ return sum(molecule._charges.values())
370
+
371
+ def _group_ions(self, reaction: ReactionContainer):
372
+ """
373
+ Ungroup molecules recorded as ions, regroup ions. Returns a tuple with the corresponding ReactionContainer and
374
+ return code as int (0 - nothing was changed, 1 - ions were regrouped, 2 - ions are unbalanced).
375
+ :param reaction: current reaction
376
+ :return: tuple[ReactionContainer, int]
377
+ """
378
+ meta = reaction.meta
379
+ reaction_parts = []
380
+ return_codes = []
381
+ for molecules in (reaction.reactants, reaction.reagents, reaction.products):
382
+ divided_molecules = [x for m in molecules for x in m.split('.')]
383
+
384
+ if len(divided_molecules) == 0:
385
+ reaction_parts.append(())
386
+ continue
387
+ elif len(divided_molecules) == 1 and self._calc_charge(divided_molecules[0]) == 0:
388
+ return_codes.append(0)
389
+ reaction_parts.append(molecules)
390
+ continue
391
+ elif len(divided_molecules) == 1:
392
+ return_codes.append(2)
393
+ reaction_parts.append(molecules)
394
+ continue
395
+
396
+ new_molecules = []
397
+ cations, anions, ions = [], [], []
398
+ total_charge = 0
399
+ for molecule in divided_molecules:
400
+ mol_charge = self._calc_charge(molecule)
401
+ total_charge += mol_charge
402
+ if mol_charge == 0:
403
+ new_molecules.append(molecule)
404
+ elif mol_charge > 0:
405
+ cations.append((mol_charge, molecule))
406
+ ions.append((mol_charge, molecule))
407
+ else:
408
+ anions.append((mol_charge, molecule))
409
+ ions.append((mol_charge, molecule))
410
+
411
+ if len(cations) == 0 and len(anions) == 0:
412
+ return_codes.append(0)
413
+ reaction_parts.append(tuple(new_molecules))
414
+ continue
415
+ elif total_charge != 0:
416
+ return_codes.append(2)
417
+ reaction_parts.append(tuple(divided_molecules))
418
+ continue
419
+ else:
420
+ salt = MoleculeContainer()
421
+ for ion_charge, ion in ions:
422
+ salt = salt.union(ion)
423
+ total_charge += ion_charge
424
+ if total_charge == 0:
425
+ new_molecules.append(salt)
426
+ salt = MoleculeContainer()
427
+ if total_charge != 0:
428
+ new_molecules.append(salt)
429
+ return_codes.append(2)
430
+ reaction_parts.append(tuple(new_molecules))
431
+ else:
432
+ return_codes.append(1)
433
+ reaction_parts.append(tuple(new_molecules))
434
+ return ReactionContainer(reactants=reaction_parts[0], reagents=reaction_parts[1], products=reaction_parts[2],
435
+ meta=meta), max(return_codes)
436
+
437
+ def _split_ions(self, reaction: ReactionContainer):
438
+ """
439
+ Split ions in a reaction. Returns a tuple with the corresponding ReactionContainer and
440
+ a return code as int (0 - nothing was changed, 1 - ions were split, 2 - ions were split but the reaction
441
+ is imbalanced).
442
+ :param reaction: current reaction
443
+ :return: tuple[ReactionContainer, int]
444
+ """
445
+ meta = reaction.meta
446
+ reaction_parts = []
447
+ return_codes = []
448
+ for molecules in (reaction.reactants, reaction.reagents, reaction.products):
449
+ divided_molecules = [x for m in molecules for x in m.split('.')]
450
+
451
+ total_charge = 0
452
+ ions_present = False
453
+ for molecule in divided_molecules:
454
+ mol_charge = self._calc_charge(molecule)
455
+ total_charge += mol_charge
456
+ if mol_charge != 0:
457
+ ions_present = True
458
+
459
+ if ions_present and total_charge:
460
+ return_codes.append(2)
461
+ elif ions_present:
462
+ return_codes.append(1)
463
+ else:
464
+ return_codes.append(0)
465
+
466
+ reaction_parts.append(tuple(divided_molecules))
467
+
468
+ return ReactionContainer(reactants=reaction_parts[0], reagents=reaction_parts[1], products=reaction_parts[2],
469
+ meta=meta), max(return_codes)
470
+
471
+ def _remove_unchanged_parts(self, reaction: ReactionContainer) -> ReactionContainer:
472
+ """
473
+ Ungroup molecules, remove unchanged parts from reactants and products.
474
+ :param reaction: current reaction
475
+ :return: ReactionContainer
476
+ """
477
+ meta = reaction.meta
478
+ new_reactants = [m for m in reaction.reactants]
479
+ new_reagents = [m for m in reaction.reagents]
480
+ if self._reagents_to_reactants:
481
+ new_reactants.extend(new_reagents)
482
+ new_reagents = []
483
+ reactants = new_reactants.copy()
484
+ new_products = [m for m in reaction.products]
485
+
486
+ for reactant in reactants:
487
+ if reactant in new_products:
488
+ new_reagents.append(reactant)
489
+ new_reactants.remove(reactant)
490
+ new_products.remove(reactant)
491
+ if not self._keep_reagents:
492
+ new_reagents = []
493
+ return ReactionContainer(reactants=tuple(new_reactants), reagents=tuple(new_reagents),
494
+ products=tuple(new_products), meta=meta)
495
+
496
+ def _check_isotopes(self, reaction: ReactionContainer) -> bool:
497
+ for molecules in (reaction.reactants, reaction.products):
498
+ for molecule in molecules:
499
+ for _, atom in molecule.atoms():
500
+ if atom.isotope:
501
+ return True
502
+ return False
503
+
504
+ def _tautomerize(self, reaction: ReactionContainer) -> ReactionContainer:
505
+ """
506
+ Perform ChemAxon tautomerization.
507
+ :param reaction: reaction that needs to be tautomerized
508
+ :return: ReactionContainer
509
+ """
510
+ new_molecules = []
511
+ for part in [reaction.reactants, reaction.reagents, reaction.products]:
512
+ tmp = []
513
+ for mol in part:
514
+ with io.StringIO() as f, SDFWrite(f) as i:
515
+ i.write(mol)
516
+ sdf = f.getvalue()
517
+ mol_handler = self._MolHandler(sdf)
518
+ mol_handler.clean(True, '2')
519
+ molecule = mol_handler.getMolecule()
520
+ self._standardizer.standardize(molecule)
521
+ new_mol_handler = self._MolHandler(molecule)
522
+ new_sdf = new_mol_handler.toFormat('SDF')
523
+ with io.StringIO('\n ' + new_sdf.strip()) as f, SDFRead(f, remap=False) as i:
524
+ new_mol = next(i)
525
+ tmp.append(new_mol)
526
+ new_molecules.append(tmp)
527
+ return ReactionContainer(reactants=tuple(new_molecules[0]), reagents=tuple(new_molecules[1]),
528
+ products=tuple(new_molecules[2]), meta=reaction.meta)
529
+
530
+ # def _dearomatize_by_RDKit(self, reaction: ReactionContainer) -> ReactionContainer:
531
+ # """
532
+ # Dearomatizes by RDKit (needs in case of some mappers, such as RXNMapper).
533
+ # :param reaction: ReactionContainer
534
+ # :return: ReactionContainer
535
+ # """
536
+ # with io.StringIO() as f, RDFWrite(f) as i:
537
+ # i.write(reaction)
538
+ # s = '\n'.join(f.getvalue().split('\n')[3:])
539
+ # rxn = rdChemReactions.ReactionFromRxnBlock(s)
540
+ # reactants, reagents, products = [], [], []
541
+ # for mol in rxn.GetReactants():
542
+ # try:
543
+ # Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_KEKULIZE, catchErrors=True)
544
+ # except Chem.rdchem.KekulizeException:
545
+ # return reaction
546
+ # with io.StringIO(Chem.MolToMolBlock(mol)) as f2, SDFRead(f2, remap=False) as sdf_i:
547
+ # reactants.append(next(sdf_i))
548
+ # for mol in rxn.GetAgents():
549
+ # try:
550
+ # Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_KEKULIZE, catchErrors=True)
551
+ # except Chem.rdchem.KekulizeException:
552
+ # return reaction
553
+ # with io.StringIO(Chem.MolToMolBlock(mol)) as f2, SDFRead(f2, remap=False) as sdf_i:
554
+ # reagents.append(next(sdf_i))
555
+ # for mol in rxn.GetProducts():
556
+ # try:
557
+ # Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_KEKULIZE, catchErrors=True)
558
+ # except Chem.rdchem.KekulizeException:
559
+ # return reaction
560
+ # with io.StringIO(Chem.MolToMolBlock(mol)) as f2, SDFRead(f2, remap=False) as sdf_i:
561
+ # products.append(next(sdf_i))
562
+ #
563
+ # new_reaction = ReactionContainer(reactants=tuple(reactants), reagents=tuple(reagents), products=tuple(products),
564
+ # meta=reaction.meta)
565
+ #
566
+ # return new_reaction
567
+
568
+
569
+ if __name__ == '__main__':
570
+ import argparse
571
+
572
+ parser = argparse.ArgumentParser(description="This is a tool for reaction standardization.",
573
+ epilog="Arkadii Lin, Strasbourg/Kazan 2020", prog="Standardizer")
574
+ parser.add_argument("-i", "--input", type=str, help="Input RDF file.")
575
+ parser.add_argument("-o", "--output", type=str, help="Output RDF file.")
576
+ parser.add_argument("-id", "--idTag", default='Reaction_ID', type=str, help="ID tag in the RDF file.")
577
+ parser.add_argument("--skipErrors", action="store_true", help="Skip errors.")
578
+ parser.add_argument("--keep_unbalanced_ions", action="store_true", help="Will keep reactions with unbalanced ions.")
579
+ parser.add_argument("--action_on_isotopes", type=int, default=0, help="Action performed if an isotope is "
580
+ "found: 0 - to ignore isotopes; "
581
+ "1 - to remove reactions with isotopes; "
582
+ "2 - to clear isotopes' labels.")
583
+ parser.add_argument("--keep_reagents", action="store_true", help="Will keep reagents from the reaction.")
584
+ parser.add_argument("--add_reagents", action="store_true", help="Will add the given reagents to reactants.")
585
+ parser.add_argument("--ignore_mapping", action="store_true", help="Will ignore the initial mapping in the file.")
586
+ parser.add_argument("--keep_unchanged_parts", action="store_true", help="Will keep unchanged parts in a reaction.")
587
+ parser.add_argument("--logFile", type=str, default='logFile.txt', help="Log file name.")
588
+ parser.add_argument("--skip_tautomerize", action="store_true", help="Will skip generation of the major tautomer.")
589
+ parser.add_argument("--rdkit_dearomatization", action="store_true", help="Will kekulize the reaction using RDKit "
590
+ "facilities.")
591
+ parser.add_argument("--jvm_path", type=str,
592
+ help="JVM path (e.g. C:\\Program Files\\Java\\jdk-13.0.2).")
593
+ parser.add_argument("--jchem_path", type=str, help="JChem path (e.g. C:\\Users\\user\\JChemSuite\\lib\\jchem.jar).")
594
+ args = parser.parse_args()
595
+
596
+ standardizer = Standardizer(skip_errors=args.skipErrors, log_file=args.logFile,
597
+ keep_unbalanced_ions=args.keep_unbalanced_ions, id_tag=args.idTag,
598
+ action_on_isotopes=args.action_on_isotopes, keep_reagents=args.keep_reagents,
599
+ ignore_mapping=args.ignore_mapping, skip_tautomerize=args.skip_tautomerize,
600
+ remove_unchanged_parts=(not args.keep_unchanged_parts), jvm_path=args.jvm_path,
601
+ jchem_path=args.jchem_path, rdkit_dearomatization=args.rdkit_dearomatization,
602
+ add_reagents_to_reactants=args.add_reagents)
603
+ data = standardizer.standardize_file(input_file=args.input)
604
+ standardizer.write(output_file=args.output, data=data)
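
For quick reference, a minimal sketch of driving the same standardization from Python instead of the CLI above. The file names are placeholders, the import path is assumed from the package layout, and all keyword arguments correspond to constructor parameters defined in this file.

from SynTool.chem.data.standardizer import Standardizer  # import path assumed from the package layout

# Placeholder file names; the input RDF is expected to carry a 'Reaction_ID' data field (the default id_tag).
standardizer = Standardizer(skip_errors=True, log_file='standardization.log',
                            keep_reagents=True, skip_tautomerize=True)
reactions = standardizer.standardize_file(input_file='reactions_raw.rdf')
standardizer.write(output_file='reactions_standardized.rdf', data=reactions)
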
SynTool/chem/reaction.py ADDED
@@ -0,0 +1,107 @@
1
+ """
2
+ Module containing classes and functions for manipulating reactions and reaction rules
3
+ """
4
+
5
+ from CGRtools.reactor import Reactor
6
+ from CGRtools.containers import MoleculeContainer, ReactionContainer
7
+ from CGRtools.exceptions import InvalidAromaticRing
8
+
9
+
10
+ class Reaction(ReactionContainer):
11
+ """
12
+ Reaction class can be used as a general representation of a reaction for different chemoinformatics Python packages
13
+ """
14
+
15
+ def __init__(self, *args, **kwargs):
16
+ """
17
+ Initializes the reaction object.
18
+ """
19
+ super().__init__(*args, **kwargs)
20
+
21
+
22
+ def add_small_mols(big_mol, small_molecules=None):
23
+ """
24
+ The function adds each given small molecule to a copy of the big molecule and returns the list of
25
+ disconnected components of the combined structure.
26
+
27
+ :param big_mol: A molecule
28
+ :param small_molecules: A list of small molecules that need to be added to the molecule
29
+ :return: Returns a list of molecules.
30
+ """
31
+ if small_molecules:
32
+ tmp_mol = big_mol.copy()
33
+ transition_mapping = {}
34
+ for small_mol in small_molecules:
35
+
36
+ for n, atom in small_mol.atoms():
37
+ new_number = tmp_mol.add_atom(atom.atomic_symbol)
38
+ transition_mapping[n] = new_number
39
+
40
+ for atom, neighbor, bond in small_mol.bonds():
41
+ tmp_mol.add_bond(transition_mapping[atom], transition_mapping[neighbor], bond)
42
+
43
+ transition_mapping = {}
44
+ return tmp_mol.split()
45
+ else:
46
+ return [big_mol]
47
+
48
+
49
+ def apply_reaction_rule(
50
+ molecule: MoleculeContainer,
51
+ reaction_rule: Reactor,
52
+ sort_reactions: bool = False,
53
+ top_reactions_num: int = 3,
54
+ validate_products: bool = True,
55
+ rebuild_with_cgr: bool = False,
56
+ ) -> list[MoleculeContainer]:
57
+ """
58
+ The function applies a reaction rule to a given molecule.
59
+
60
+ :param rebuild_with_cgr: if True, products are rebuilt by composing the reaction into a CGR and decomposing it back
61
+ :param validate_products: if True, each product is kekulized and checked for valence errors
62
+ :param sort_reactions: if True, generated reactions are sorted by the number of large (more than 6 atoms) products
63
+ :param top_reactions_num: maximum number of reactions taken from the reactor
64
+ :param molecule: A MoleculeContainer object representing the molecule on which the reaction rule will be applied
65
+ :type molecule: MoleculeContainer
66
+ :param reaction_rule: The reaction_rule is an instance of the Reactor class. It represents a reaction rule that
67
+ can be applied to a molecule
68
+ :type reaction_rule: Reactor
69
+ """
70
+
71
+ reactants = add_small_mols(molecule, small_molecules=False)
72
+
73
+ try:
74
+ if sort_reactions:
75
+ unsorted_reactions = list(reaction_rule(reactants))
76
+ sorted_reactions = sorted(
77
+ unsorted_reactions,
78
+ key=lambda react: len(list(filter(lambda mol: len(mol) > 6, react.products))),
79
+ reverse=True
80
+ )
81
+ reactions = sorted_reactions[:top_reactions_num] # Take top-N reactions from reactor
82
+ else:
83
+ reactions = []
84
+ for reaction in reaction_rule(reactants):
85
+ reactions.append(reaction)
86
+ if len(reactions) == top_reactions_num:
87
+ break
88
+ except IndexError:
89
+ reactions = []
90
+
91
+ for reaction in reactions:
92
+ if rebuild_with_cgr:
93
+ cgr = reaction.compose()
94
+ products = cgr.decompose()[1].split()
95
+ else:
96
+ products = reaction.products
97
+ products = [mol for mol in products if len(mol) > 0]
98
+ if validate_products:
99
+ for molecule in products:
100
+ try:
101
+ molecule.kekule()
102
+ if molecule.check_valence():
103
+ yield None
104
+ molecule.thiele()
105
+ except InvalidAromaticRing:
106
+ yield None
107
+ yield products
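
A short usage sketch for apply_reaction_rule defined above. The pickle path and the SMILES string are placeholders, and the (rule, indices) layout of the pickle is assumed to follow sort_rules from the extraction module later in this commit.

import pickle
from CGRtools import smiles
from CGRtools.reactor import Reactor
from SynTool.chem.reaction import apply_reaction_rule

# Assumed: a pickle of (rule, indices) pairs as produced by sort_rules in extraction.py.
with open('reaction_rules.pickle', 'rb') as f:
    rule, _indices = pickle.load(f)[0]

reactor = Reactor(rule)
target = smiles('CC(=O)Nc1ccccc1')  # illustrative target molecule
for products in apply_reaction_rule(target, reactor, top_reactions_num=3):
    if products:  # None is yielded when a generated product fails validation
        print('.'.join(str(mol) for mol in products))
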
SynTool/chem/reaction_rules/__init__.py ADDED
File without changes
SynTool/chem/reaction_rules/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes). View file
 
SynTool/chem/reaction_rules/__pycache__/extraction.cpython-310.pyc ADDED
Binary file (25.7 kB). View file
 
SynTool/chem/reaction_rules/extraction.py ADDED
@@ -0,0 +1,679 @@
1
+ """
2
+ Module containing functions implementing the fixed protocol for reaction rule extraction
3
+ """
4
+ import logging
5
+ import pickle
6
+ from collections import defaultdict
7
+ from itertools import islice
8
+ from pathlib import Path
9
+ from typing import List, Union, Tuple, IO, Dict, Set, Iterable, Any
10
+ from os.path import splitext
11
+
12
+
13
+ import ray
14
+ from CGRtools.containers import MoleculeContainer, QueryContainer, ReactionContainer
15
+ from CGRtools.exceptions import InvalidAromaticRing
16
+ from CGRtools.reactor import Reactor
17
+ from tqdm.auto import tqdm
18
+
19
+ from SynTool.chem.utils import reverse_reaction
20
+ from SynTool.utils.config import RuleExtractionConfig
21
+ from SynTool.utils.files import ReactionReader
22
+
23
+
24
+ def extract_rules_from_reactions(
25
+ config: RuleExtractionConfig,
26
+ reaction_file: str,
27
+ rules_file_name: str = 'reaction_rules.pickle',
28
+ num_cpus: int = 1,
29
+ batch_size: int = 10,
30
+ ) -> None:
31
+ """
32
+ Extracts reaction rules from a set of reactions based on the given configuration.
33
+
34
+ This function initializes a Ray environment for distributed computing and processes each reaction
35
+ in the provided reaction database to extract reaction rules. It handles the reactions in batches,
36
+ parallelizing the rule extraction process. Occurrence statistics for the extracted rules
37
+ are recorded. The function then sorts the rules by popularity and saves them to a pickle file.
38
+
39
+ :param config: Configuration settings for rule extraction, including file paths, batch size, and other parameters.
40
+ :param reaction_file: Path to the file containing the reaction database.
41
+ :param rules_file_name: Name of the file to store the extracted rules.
42
+ :param num_cpus: Number of CPU cores to use for processing. Defaults to 1.
43
+ :param batch_size: Number of reactions to process in each batch. Defaults to 10.
44
+
45
+ :return: None
46
+ """
47
+
48
+ # read files
49
+ reaction_file = Path(reaction_file).resolve(strict=True)
50
+
51
+ ray.init(num_cpus=num_cpus, ignore_reinit_error=True, logging_level=logging.ERROR)
52
+
53
+ rules_file_name, _ = splitext(rules_file_name)
54
+ with ReactionReader(reaction_file) as reactions:
55
+ pbar = tqdm(reactions, disable=False)
56
+
57
+ futures = {}
58
+ batch = []
59
+ max_concurrent_batches = num_cpus
60
+
61
+ extracted_rules_and_statistics = defaultdict(list)
62
+ for index, reaction in enumerate(reactions):
63
+ batch.append((index, reaction))
64
+ if len(batch) == batch_size:
65
+ future = process_reaction_batch.remote(batch, config)
66
+ futures[future] = None
67
+ batch = []
68
+
69
+ while len(futures) >= max_concurrent_batches:
70
+ process_completed_batches(futures, extracted_rules_and_statistics, pbar, batch_size)
71
+
72
+ if batch:
73
+ remaining_size = len(batch)
74
+ future = process_reaction_batch.remote(batch, config)
75
+ futures[future] = None
76
+
77
+ while futures:
78
+ process_completed_batches(futures, extracted_rules_and_statistics, pbar, remaining_size)
79
+
80
+ pbar.close()
81
+
82
+ sorted_rules = sort_rules(
83
+ extracted_rules_and_statistics,
84
+ min_popularity=config.min_popularity,
85
+ single_reactant_only=config.single_reactant_only,
86
+ )
87
+
88
+ with open(f"{rules_file_name}.pickle", "wb") as statistics_file:
89
+ pickle.dump(sorted_rules, statistics_file)
90
+ print(f'Number of extracted reaction rules: {len(sorted_rules)}')
91
+
92
+ ray.shutdown()
93
+
94
+
95
+ @ray.remote
96
+ def process_reaction_batch(
97
+ batch: List[Tuple[int, ReactionContainer]], config: RuleExtractionConfig
98
+ ) -> list[tuple[int, list[ReactionContainer]]]:
99
+ """
100
+ Processes a batch of reactions to extract reaction rules based on the given configuration.
101
+
102
+ This function operates as a remote task in a distributed system using Ray. It takes a batch of reactions,
103
+ where each reaction is paired with an index. For each reaction in the batch, it extracts reaction rules
104
+ as specified by the configuration object. The extracted rules for each reaction are then returned along
105
+ with the corresponding index.
106
+
107
+ :param batch: A list where each element is a tuple containing an index (int) and a ReactionContainer object.
108
+ The index is typically used to keep track of the reaction's position in a larger dataset.
109
+ :type batch: List[Tuple[int, ReactionContainer]]
110
+
111
+ :param config: An instance of RuleExtractionConfig that provides settings and parameters for the rule extraction process.
112
+ :type config: RuleExtractionConfig
113
+
114
+ :return: A list where each element is a tuple. The first element of the tuple is an index (int), and the second
115
+ is a list of ReactionContainer objects representing the extracted rules for the corresponding reaction.
116
+ :rtype: list[tuple[int, list[ReactionContainer]]]
117
+
118
+ This function is intended to be used in a distributed manner with Ray to parallelize the rule extraction
119
+ process across multiple reactions.
120
+ """
121
+ processed_batch = []
122
+ for index, reaction in batch:
123
+ try:
124
+ extracted_rules = extract_rules(config, reaction)
125
+ processed_batch.append((index, extracted_rules))
126
+ except:
127
+ continue
128
+ return processed_batch
129
+
130
+
131
+ def process_completed_batches(
132
+ futures: dict,
133
+ rules_statistics: Dict[ReactionContainer, List[int]],
134
+ pbar: tqdm,
135
+ batch_size: int,
136
+ ) -> None:
137
+ """
138
+ Processes a completed batch of reactions, updating the rule statistics and the progress bar.
139
+
140
+ This function waits for the completion of a batch of reactions processed in parallel (using Ray),
141
+ updates the statistics for each extracted rule, and records the index of the first reaction that produced a new rule.
142
+ It also updates the progress bar with the size of the processed batch.
143
+
144
+ :param futures: A dictionary of futures representing ongoing batch processing tasks.
145
+ :type futures: dict
146
+
147
+ :param rules_statistics: A dictionary to keep track of statistics for each rule.
148
+ :type rules_statistics: Dict[ReactionContainer, List[int]]
149
+
150
+ :param pbar: A tqdm progress bar instance for updating the progress of batch processing.
151
+ :type pbar: tqdm
152
+
153
+ :param batch_size: The number of reactions processed in each batch.
154
+ :type batch_size: int
155
+
156
+ :return: None
157
+ """
158
+ done, _ = ray.wait(list(futures.keys()), num_returns=1)
159
+ completed_batch = ray.get(done[0])
160
+
161
+ for index, extracted_rules in completed_batch:
162
+ for rule in extracted_rules:
163
+ prev_stats_len = len(rules_statistics)
164
+ rules_statistics[rule].append(index)
165
+ if len(rules_statistics) != prev_stats_len:
166
+ rule.meta["first_reaction_index"] = index
167
+
168
+ del futures[done[0]]
169
+ pbar.update(batch_size)
170
+
171
+
172
+ def extract_rules(
173
+ config: RuleExtractionConfig, reaction: ReactionContainer
174
+ ) -> list[ReactionContainer]:
175
+ """
176
+ Extracts reaction rules from a given reaction based on the specified configuration.
177
+
178
+ :param config: An instance of RuleExtractionConfig, which contains various configuration settings
179
+ for rule extraction, such as whether to include multicenter rules, functional groups,
180
+ ring structures, leaving and incoming groups, etc.
181
+ :param reaction: The reaction object (ReactionContainer) from which to extract rules. The reaction
182
+ object represents a chemical reaction with specified reactants, products, and possibly reagents.
183
+ :return: A list of ReactionContainer objects, each representing a distinct reaction rule. If
184
+ config.multicenter_rules is True, a single rule encompassing all reaction centers is returned.
185
+ Otherwise, separate rules for each reaction center are extracted, up to a maximum of 15 distinct centers.
186
+ """
187
+ if config.multicenter_rules:
188
+ # Extract a single rule encompassing all reaction centers
189
+ return [create_rule(config, reaction)]
190
+ else:
191
+ # Extract separate rules for each distinct reaction center
192
+ distinct_rules = set()
193
+ for center_reaction in islice(reaction.enumerate_centers(), 15):
194
+ single_rule = create_rule(config, center_reaction)
195
+ distinct_rules.add(single_rule)
196
+ return list(distinct_rules)
197
+
198
+
199
+ def create_rule(
200
+ config: RuleExtractionConfig, reaction: ReactionContainer
201
+ ) -> ReactionContainer:
202
+ """
203
+ Creates a reaction rule from a given reaction based on the specified configuration.
204
+
205
+ :param config: An instance of RuleExtractionConfig, containing various settings that determine how
206
+ the rule is created, such as environmental atom count, inclusion of functional groups,
207
+ rings, leaving and incoming groups, and other parameters.
208
+ :param reaction: The reaction object (ReactionContainer) from which to create the rule. This object
209
+ represents a chemical reaction with specified reactants, products, and possibly reagents.
210
+ :return: A ReactionContainer object representing the extracted reaction rule. This rule includes
211
+ various elements of the reaction as specified by the configuration, such as reaction centers,
212
+ environmental atoms, functional groups, and others.
213
+
214
+ The function processes the reaction to create a rule that matches the configuration settings. It handles
215
+ the inclusion of environmental atoms, functional groups, ring structures, and leaving and incoming groups.
216
+ It also constructs substructures for reactants, products, and reagents, and cleans molecule representations
217
+ if required. Optionally, it validates the rule using a reactor.
218
+ """
219
+ cgr = ~reaction
220
+ center_atoms = set(cgr.center_atoms)
221
+
222
+ # Add atoms of reaction environment based on config settings
223
+ center_atoms = add_environment_atoms(
224
+ cgr, center_atoms, config.environment_atom_count
225
+ )
226
+
227
+ # Include functional groups in the rule if specified in config
228
+ if config.include_func_groups:
229
+ rule_atoms = add_functional_groups(
230
+ reaction, center_atoms, config.func_groups_list
231
+ )
232
+ else:
233
+ rule_atoms = center_atoms.copy()
234
+
235
+ # Include ring structures in the rule if specified in config
236
+ if config.include_rings:
237
+ rule_atoms = add_ring_structures(
238
+ cgr,
239
+ rule_atoms,
240
+ )
241
+
242
+ # Add leaving and incoming groups to the rule based on config settings
243
+ rule_atoms, meta_debug = add_leaving_incoming_groups(
244
+ reaction, rule_atoms, config.keep_leaving_groups, config.keep_incoming_groups
245
+ )
246
+
247
+ # Create substructures for reactants, products, and reagents
248
+ (
249
+ reactant_substructures,
250
+ product_substructures,
251
+ reagents,
252
+ ) = create_substructures_and_reagents(
253
+ reaction, rule_atoms, config.as_query_container, config.keep_reagents
254
+ )
255
+
256
+ # Clean atom marks in the molecules if they are being converted to query containers
257
+ if config.as_query_container:
258
+ reactant_substructures = clean_molecules(
259
+ reactant_substructures,
260
+ reaction.reactants,
261
+ center_atoms,
262
+ config.atom_info_retention,
263
+ )
264
+ product_substructures = clean_molecules(
265
+ product_substructures,
266
+ reaction.products,
267
+ center_atoms,
268
+ config.atom_info_retention,
269
+ )
270
+
271
+ # Assemble the final rule including metadata if specified
272
+ rule = assemble_final_rule(
273
+ reactant_substructures,
274
+ product_substructures,
275
+ reagents,
276
+ meta_debug,
277
+ config.keep_metadata,
278
+ reaction,
279
+ )
280
+
281
+ if config.reverse_rule:
282
+ rule = reverse_reaction(rule)
283
+ reaction = reverse_reaction(reaction)
284
+
285
+ # Validate the rule using a reactor if validation is enabled in config
286
+ if config.reactor_validation:
287
+ if validate_rule(rule, reaction):
288
+ rule.meta["reactor_validation"] = "passed"
289
+ else:
290
+ rule.meta["reactor_validation"] = "failed"
291
+
292
+ return rule
293
+
294
+
295
+ def add_environment_atoms(cgr, center_atoms, environment_atom_count):
296
+ """
297
+ Adds environment atoms to the set of center atoms based on the specified depth.
298
+
299
+ :param cgr: A complete graph representation of a reaction (ReactionContainer object).
300
+ :param center_atoms: A set of atom identifiers representing the center atoms of the reaction.
301
+ :param environment_atom_count: An integer specifying the depth of the environment around
302
+ the reaction center to be included. If it's 0, only the
303
+ reaction center is included. If it's 1, the first layer of
304
+ surrounding atoms is included, and so on.
305
+ :return: A set of atom identifiers including the center atoms and their environment atoms
306
+ up to the specified depth. If environment_atom_count is 0, the original set of
307
+ center atoms is returned unchanged.
308
+ """
309
+ if environment_atom_count:
310
+ env_cgr = cgr.augmented_substructure(center_atoms, deep=environment_atom_count)
311
+ # Combine the original center atoms with the new environment atoms
312
+ return center_atoms | set(env_cgr)
313
+
314
+ # If no environment is to be included, return the original center atoms
315
+ return center_atoms
316
+
317
+
318
+ def add_functional_groups(reaction, center_atoms, func_groups_list):
319
+ """
320
+ Augments the set of rule atoms with functional groups if specified.
321
+
322
+ :param reaction: The reaction object (ReactionContainer) from which molecules are extracted.
323
+ :param center_atoms: A set of atom identifiers representing the center atoms of the reaction.
324
+ :param func_groups_list: A list of functional group objects (MoleculeContainer or QueryContainer)
325
+ to be considered when including functional groups. These objects define
326
+ the structure of the functional groups to be included.
327
+ :return: A set of atom identifiers representing the rule atoms, including atoms from the
328
+ specified functional groups if include_func_groups is True. If include_func_groups
329
+ is False, the original set of center atoms is returned.
330
+ """
331
+ rule_atoms = center_atoms.copy()
332
+ # Iterate over each molecule in the reaction
333
+ for molecule in reaction.molecules():
334
+ # For each functional group specified in the list
335
+ for func_group in func_groups_list:
336
+ # Find mappings of the functional group in the molecule
337
+ for mapping in func_group.get_mapping(molecule):
338
+ # Remap the functional group based on the found mapping
339
+ func_group.remap(mapping)
340
+ # If the functional group intersects with center atoms, include it
341
+ if set(func_group.atoms_numbers) & center_atoms:
342
+ rule_atoms |= set(func_group.atoms_numbers)
343
+ # Reset the mapping to its original state for the next iteration
344
+ func_group.remap({v: k for k, v in mapping.items()})
345
+ return rule_atoms
346
+
347
+
348
+ def add_ring_structures(cgr, rule_atoms):
349
+ """
350
+ Appends ring structures to the set of rule atoms if they intersect with the reaction center atoms.
351
+
352
+ :param cgr: A condensed graph representation of a reaction (CGRContainer object).
353
+ :param rule_atoms: A set of atom identifiers representing the center atoms of the reaction.
354
+ :return: A set of atom identifiers including the original rule atoms and the included ring structures.
355
+ """
356
+ for ring in cgr.sssr:
357
+ # Check if the current ring intersects with the set of rule atoms
358
+ if set(ring) & rule_atoms:
359
+ # If the intersection exists, include all atoms in the ring to the rule atoms
360
+ rule_atoms |= set(ring)
361
+ return rule_atoms
362
+
363
+
364
+ def add_leaving_incoming_groups(
365
+ reaction, rule_atoms, keep_leaving_groups, keep_incoming_groups
366
+ ):
367
+ """
368
+ Identifies and includes leaving and incoming groups to the rule atoms based on specified flags.
369
+
370
+ :param reaction: The reaction object (ReactionContainer) from which leaving and incoming groups are extracted.
371
+ :param rule_atoms: A set of atom identifiers representing the center atoms of the reaction.
372
+ :param keep_leaving_groups: A boolean flag indicating whether to include leaving groups in the rule.
373
+ :param keep_incoming_groups: A boolean flag indicating whether to include incoming groups in the rule.
374
+ :return: Updated set of rule atoms including leaving and incoming groups if specified, and metadata about added groups.
375
+ """
376
+ meta_debug = {"leaving": set(), "incoming": set()}
377
+
378
+ # Extract atoms from reactants and products
379
+ reactant_atoms = {atom for reactant in reaction.reactants for atom in reactant}
380
+ product_atoms = {atom for product in reaction.products for atom in product}
381
+
382
+ # Identify leaving groups (reactant atoms not in products)
383
+ if keep_leaving_groups:
384
+ leaving_atoms = reactant_atoms - product_atoms
385
+ new_leaving_atoms = leaving_atoms - rule_atoms
386
+ # Include leaving atoms in the rule atoms
387
+ rule_atoms |= leaving_atoms
388
+ # Add leaving atoms to metadata
389
+ meta_debug["leaving"] |= new_leaving_atoms
390
+
391
+ # Identify incoming groups (product atoms not in reactants)
392
+ if keep_incoming_groups:
393
+ incoming_atoms = product_atoms - reactant_atoms
394
+ new_incoming_atoms = incoming_atoms - rule_atoms
395
+ # Include incoming atoms in the rule atoms
396
+ rule_atoms |= incoming_atoms
397
+ # Add incoming atoms to metadata
398
+ meta_debug["incoming"] |= new_incoming_atoms
399
+
400
+ return rule_atoms, meta_debug
401
+
402
+
403
+ def clean_molecules(
404
+ rule_molecules: Iterable[QueryContainer],
405
+ reaction_molecules: Iterable[MoleculeContainer],
406
+ reaction_center_atoms: Set[int],
407
+ atom_retention_details: Dict[str, Dict[str, bool]],
408
+ ) -> List[QueryContainer]:
409
+ """
410
+ Cleans rule molecules by removing specified information about atoms based on retention details provided.
411
+
412
+ :param rule_molecules: A list of query container objects representing the rule molecules.
413
+ :param reaction_molecules: A list of molecule container objects involved in the reaction.
414
+ :param reaction_center_atoms: A set of integers representing atom numbers in the reaction center.
415
+ :param atom_retention_details: A dictionary specifying what atom information to retain or remove.
416
+ This dictionary should have two keys: "reaction_center" and "environment",
417
+ each mapping to another dictionary. The nested dictionaries should have
418
+ keys representing atom attributes (like "neighbors", "hybridization",
419
+ "implicit_hydrogens", "ring_sizes") and boolean values. A value of True
420
+ indicates that the corresponding attribute should be retained,
421
+ while False indicates it should be removed from the atom.
422
+
423
+ For example:
424
+ {
425
+ "reaction_center": {"neighbors": True, "hybridization": False, ...},
426
+ "environment": {"neighbors": True, "implicit_hydrogens": False, ...}
427
+ }
428
+
429
+ Returns:
430
+ A list of QueryContainer objects representing the cleaned rule molecules.
431
+ """
432
+ cleaned_rule_molecules = []
433
+
434
+ for rule_molecule in rule_molecules:
435
+ for reaction_molecule in reaction_molecules:
436
+ if set(rule_molecule.atoms_numbers) <= set(reaction_molecule.atoms_numbers):
437
+ query_reaction_molecule = reaction_molecule.substructure(
438
+ reaction_molecule, as_query=True
439
+ )
440
+ query_rule_molecule = query_reaction_molecule.substructure(
441
+ rule_molecule
442
+ )
443
+
444
+ # Clean environment atoms
445
+ if not all(
446
+ atom_retention_details["environment"].values()
447
+ ): # if everything True, we keep all marks
448
+ local_environment_atoms = (
449
+ set(rule_molecule.atoms_numbers) - reaction_center_atoms
450
+ )
451
+ for atom_number in local_environment_atoms:
452
+ query_rule_molecule = clean_atom(
453
+ query_rule_molecule,
454
+ atom_retention_details["environment"],
455
+ atom_number,
456
+ )
457
+
458
+ # Clean reaction center atoms
459
+ if not all(
460
+ atom_retention_details["reaction_center"].values()
461
+ ): # if everything True, we keep all marks
462
+ local_reaction_center_atoms = (
463
+ set(rule_molecule.atoms_numbers) & reaction_center_atoms
464
+ )
465
+ for atom_number in local_reaction_center_atoms:
466
+ query_rule_molecule = clean_atom(
467
+ query_rule_molecule,
468
+ atom_retention_details["reaction_center"],
469
+ atom_number,
470
+ )
471
+
472
+ cleaned_rule_molecules.append(query_rule_molecule)
473
+ break
474
+
475
+ return cleaned_rule_molecules
476
+
477
+
478
+ def clean_atom(
479
+ query_molecule: QueryContainer,
480
+ attributes_to_keep: Dict[str, bool],
481
+ atom_number: int,
482
+ ) -> QueryContainer:
483
+ """
484
+ Removes specified information from a given atom in a query molecule.
485
+
486
+ :param query_molecule: The QueryContainer of molecule.
487
+ :param attributes_to_keep: Dictionary indicating which attributes to keep in the atom.
488
+ The keys should be strings representing the attribute names, and
489
+ the values should be booleans indicating whether to retain (True)
490
+ or remove (False) that attribute. Expected keys are:
491
+ - "neighbors": Indicates if the neighbors count of the atom should be retained.
492
+ - "hybridization": Indicates if hybridization information of the atom should be retained.
493
+ - "implicit_hydrogens": Indicates if implicit hydrogen information of the atom should be retained.
494
+ - "ring_sizes": Indicates if ring size information of the atom should be retained.
495
+ :param atom_number: The number of the atom to be modified in the query molecule.
496
+ """
497
+ target_atom = query_molecule.atom(atom_number)
498
+
499
+ if not attributes_to_keep["neighbors"]:
500
+ target_atom.neighbors = None
501
+ if not attributes_to_keep["hybridization"]:
502
+ target_atom.hybridization = None
503
+ if not attributes_to_keep["implicit_hydrogens"]:
504
+ target_atom.implicit_hydrogens = None
505
+ if not attributes_to_keep["ring_sizes"]:
506
+ target_atom.ring_sizes = None
507
+
508
+ return query_molecule
509
+
510
+
511
+ def create_substructures_and_reagents(
512
+ reaction, rule_atoms, as_query_container, keep_reagents
513
+ ):
514
+ """
515
+ Creates substructures for reactants and products, and optionally includes reagents, based on specified parameters.
516
+
517
+ :param reaction: The reaction object (ReactionContainer) from which to extract substructures. This object
518
+ represents a chemical reaction with specified reactants, products, and possibly reagents.
519
+ :param rule_atoms: A set of atom identifiers that define the rule atoms. These are used to identify relevant
520
+ substructures in reactants and products.
521
+ :param as_query_container: A boolean flag indicating whether the substructures should be converted to query containers.
522
+ Query containers are used for pattern matching in chemical structures.
523
+ :param keep_reagents: A boolean flag indicating whether reagents should be included in the resulting structures.
524
+ Reagents are additional substances that are present in the reaction but are not reactants or products.
525
+
526
+ :return: A tuple containing three elements:
527
+ - A list of reactant substructures, each corresponding to a part of the reactants that matches the rule atoms.
528
+ - A list of product substructures, each corresponding to a part of the products that matches the rule atoms.
529
+ - A list of reagents, included as is or as substructures, depending on the as_query_container flag.
530
+
531
+ The function processes the reaction to create substructures for reactants and products based on the rule atoms.
532
+ It also handles the inclusion of reagents based on the keep_reagents flag and converts these structures to query
533
+ containers if required.
534
+ """
535
+ reactant_substructures = [
536
+ reactant.substructure(rule_atoms.intersection(reactant.atoms_numbers))
537
+ for reactant in reaction.reactants
538
+ if rule_atoms.intersection(reactant.atoms_numbers)
539
+ ]
540
+
541
+ product_substructures = [
542
+ product.substructure(rule_atoms.intersection(product.atoms_numbers))
543
+ for product in reaction.products
544
+ if rule_atoms.intersection(product.atoms_numbers)
545
+ ]
546
+
547
+ reagents = []
548
+ if keep_reagents:
549
+ if as_query_container:
550
+ reagents = [
551
+ reagent.substructure(reagent, as_query=True)
552
+ for reagent in reaction.reagents
553
+ ]
554
+ else:
555
+ reagents = reaction.reagents
556
+
557
+ return reactant_substructures, product_substructures, reagents
558
+
559
+
560
+ def assemble_final_rule(
561
+ reactant_substructures,
562
+ product_substructures,
563
+ reagents,
564
+ meta_debug,
565
+ keep_metadata,
566
+ reaction,
567
+ ):
568
+ """
569
+ Assembles the final reaction rule from the provided substructures and metadata.
570
+
571
+ :param reactant_substructures: A list of substructures derived from the reactants of the reaction.
572
+ These substructures represent parts of reactants that are relevant to the rule.
573
+ :param product_substructures: A list of substructures derived from the products of the reaction.
574
+ These substructures represent parts of products that are relevant to the rule.
575
+ :param reagents: A list of reagents involved in the reaction. These may be included as-is or as substructures,
576
+ depending on earlier processing steps.
577
+ :param meta_debug: A dictionary containing additional metadata about the reaction, such as leaving and incoming groups.
578
+ :param keep_metadata: A boolean flag indicating whether to retain the metadata associated with the reaction in the rule.
579
+ :param reaction: The original reaction object (ReactionContainer) from which the rule is being created.
580
+
581
+ :return: A ReactionContainer object representing the assembled reaction rule. This container includes
582
+ the reactant and product substructures, reagents, and any additional metadata if keep_metadata is True.
583
+
584
+ This function brings together the various components of a reaction rule, including reactant and product substructures,
585
+ reagents, and metadata. It creates a comprehensive representation of the reaction rule, which can be used for further
586
+ processing or analysis.
587
+ """
588
+ rule_metadata = meta_debug if keep_metadata else {}
589
+ rule_metadata.update(reaction.meta if keep_metadata else {})
590
+
591
+ rule = ReactionContainer(
592
+ reactant_substructures, product_substructures, reagents, rule_metadata
593
+ )
594
+
595
+ if keep_metadata:
596
+ rule.name = reaction.name
597
+
598
+ rule.flush_cache()
599
+ return rule
600
+
601
+
602
+ def validate_rule(rule: ReactionContainer, reaction: ReactionContainer):
603
+ """
604
+ Validates a reaction rule by ensuring it can correctly generate the products from the reactants.
605
+
606
+ :param rule: The reaction rule to be validated. This is a ReactionContainer object representing a chemical reaction rule,
607
+ which includes the necessary information to perform a reaction.
608
+ :param reaction: The original reaction object (ReactionContainer) against which the rule is to be validated. This object
609
+ contains the actual reactants and products of the reaction.
610
+
611
+ :return: True if the rule correctly regenerates the original products from the reactants, False otherwise.
612
+
613
+ Note that no exception is raised on a mismatch: KeyError and IndexError raised by the reactor are caught
614
+ and treated as a failed validation.
615
+
616
+ The function uses a chemical reactor to simulate the reaction based on the provided rule. It then compares
617
+ the products generated by the simulation with the actual products of the reaction. If they match, the rule
618
+ is considered valid; otherwise the function returns False.
619
+ """
620
+ # Create a reactor with the given rule
621
+ reactor = Reactor(rule)
622
+ try:
623
+ for result_reaction in reactor(reaction.reactants):
624
+ result_products = []
625
+ for result_product in result_reaction.products:
626
+ tmp = result_product.copy()
627
+ try:
628
+ tmp.kekule()
629
+ if tmp.check_valence():
630
+ continue
631
+ except InvalidAromaticRing:
632
+ continue
633
+ result_products.append(result_product)
634
+ if set(reaction.products) == set(result_products) and len(
635
+ reaction.products
636
+ ) == len(result_products):
637
+ return True
638
+ except (KeyError, IndexError):
639
+ # KeyError - iteration over reactor is finished and products are different from the original reaction
640
+ # IndexError - mistake in __contract_ions, possibly problems with charges in rule?
641
+ return False
642
+
643
+
644
+ def sort_rules(
645
+ rules_stats: Dict[ReactionContainer, List[int]],
646
+ min_popularity: int = 3,
647
+ single_reactant_only: bool = True,
648
+ ) -> List[Tuple[ReactionContainer, List[int]]]:
649
+ """
650
+ Sorts reaction rules based on their popularity and validation status.
651
+
652
+ This function sorts the given rules according to their popularity (i.e., the number of times they have been
653
+ applied) and filters out rules that haven't passed reactor validation or are less popular than the specified
654
+ minimum popularity threshold.
655
+
656
+ :param rules_stats: A dictionary where each key is a reaction rule and the value is a list of integers.
657
+ Each integer represents an index where the rule was applied.
658
+ :type rules_stats: Dict[ReactionContainer, List[int]]
659
+
660
+ :param min_popularity: The minimum number of times a rule must be applied to be considered. Default is 3.
661
+ :type min_popularity: int
662
+
663
+ :param single_reactant_only: Whether to keep only reaction rules with a single reactant pattern
664
+ (len(rule.reactants) == 1). Default is True.
665
+
666
+ :return: A list of tuples, where each tuple contains a reaction rule and a list of indices representing
667
+ the rule's applications. The list is sorted in descending order of the rule's popularity.
668
+ :rtype: List[Tuple[ReactionContainer, List[int]]]
669
+ """
670
+ return sorted(
671
+ (
672
+ (rule, indices)
673
+ for rule, indices in rules_stats.items()
674
+ if len(indices) >= min_popularity
675
+ and rule.meta["reactor_validation"] == "passed"
676
+ and (not single_reactant_only or len(rule.reactants) == 1)
677
+ ),
678
+ key=lambda x: -len(x[1]),
679
+ )
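For orientation, a hedged usage sketch (not part of this commit) of how validate_rule and sort_rules could be chained when curating extracted rules; `reactions` and `extract_rule` are hypothetical stand-ins for the extraction pipeline defined earlier in this module.

# Hypothetical curation sketch: `reactions` is an iterable of mapped ReactionContainer
# objects and `extract_rule` stands in for the rule extraction routine of this module.
from collections import defaultdict

rules_stats = defaultdict(list)
for reaction_index, reaction in enumerate(reactions):
    rule = extract_rule(reaction)  # assumed single-rule extraction call
    # record the validation status that sort_rules later filters on
    rule.meta["reactor_validation"] = "passed" if validate_rule(rule, reaction) else "failed"
    rules_stats[rule].append(reaction_index)

# keep rules seen at least 3 times, validated by the reactor, with a single reactant query
popular_rules = sort_rules(rules_stats, min_popularity=3, single_reactant_only=True)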
SynTool/chem/reaction_rules/manual/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .decompositions import rules as d_rules
2
+ from .transformations import rules as t_rules
3
+
4
+ hardcoded_rules = t_rules + d_rules
5
+
6
+ __all__ = ["hardcoded_rules"]
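A brief, hedged illustration (not part of the commit) of how the aggregated hardcoded_rules might be merged with automatically extracted rules before planning; the pickle file name is a placeholder.

# Illustrative only: combining manual rules with extracted ones.
import pickle

from SynTool.chem.reaction_rules.manual import hardcoded_rules

with open("reaction_rules.pickle", "rb") as f:   # placeholder path to extracted rules
    extracted_rules = pickle.load(f)

all_rules = list(extracted_rules) + list(hardcoded_rules)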
SynTool/chem/reaction_rules/manual/decompositions.py ADDED
@@ -0,0 +1,415 @@
1
+ """
2
+ Module containing hardcoded decomposition reaction rules
3
+ """
4
+
5
+ from CGRtools import QueryContainer, ReactionContainer
6
+ from CGRtools.periodictable import ListElement
7
+
8
+ rules = []
9
+
10
+
11
+ def prepare():
12
+ """
13
+ Creates and returns three query containers and appends a reaction container to the "rules" list
14
+ """
15
+ q_ = QueryContainer()
16
+ p1_ = QueryContainer()
17
+ p2_ = QueryContainer()
18
+ rules.append(ReactionContainer((q_,), (p1_, p2_)))
19
+ return q_, p1_, p2_
20
+
21
+
22
+ # R-amide/ester formation
23
+ # [C](-[N,O;D23;Zs])(-[C])=[O]>>[A].[C]-[C](-[O])=[O]
24
+ q, p1, p2 = prepare()
25
+ q.add_atom('C')
26
+ q.add_atom('C')
27
+ q.add_atom('O')
28
+ q.add_atom(ListElement(['N', 'O']), hybridization=1, neighbors=(2, 3))
29
+ q.add_bond(1, 2, 1)
30
+ q.add_bond(2, 3, 2)
31
+ q.add_bond(2, 4, 1)
32
+
33
+ p1.add_atom('C')
34
+ p1.add_atom('C')
35
+ p1.add_atom('O')
36
+ p1.add_atom('O', _map=5)
37
+ p1.add_bond(1, 2, 1)
38
+ p1.add_bond(2, 3, 2)
39
+ p1.add_bond(2, 5, 1)
40
+
41
+ p2.add_atom('A', _map=4)
42
+
43
+ # acyl group addition to an aromatic carbon (Friedel-Crafts acylation)
44
+ # [C;Za]-[C](-[C])=[O]>>[C].[C]-[C](-[Cl])=[O]
45
+ q, p1, p2 = prepare()
46
+ q.add_atom('C')
47
+ q.add_atom('C')
48
+ q.add_atom('O')
49
+ q.add_atom('C', hybridization=4)
50
+ q.add_bond(1, 2, 1)
51
+ q.add_bond(2, 3, 2)
52
+ q.add_bond(2, 4, 1)
53
+
54
+ p1.add_atom('C')
55
+ p1.add_atom('C')
56
+ p1.add_atom('O')
57
+ p1.add_atom('Cl', _map=5)
58
+ p1.add_bond(1, 2, 1)
59
+ p1.add_bond(2, 3, 2)
60
+ p1.add_bond(2, 5, 1)
61
+
62
+ p2.add_atom('C', _map=4)
63
+
64
+ # Williamson reaction
65
+ # [C;Za]-[O]-[C;Zs;W0]>>[C]-[Br].[C]-[O]
66
+ q, p1, p2 = prepare()
67
+ q.add_atom('C', hybridization=4)
68
+ q.add_atom('O')
69
+ q.add_atom('C', hybridization=1, heteroatoms=1)
70
+ q.add_bond(1, 2, 1)
71
+ q.add_bond(2, 3, 1)
72
+
73
+ p1.add_atom('C')
74
+ p1.add_atom('O')
75
+ p1.add_bond(1, 2, 1)
76
+
77
+ p2.add_atom('C', _map=3)
78
+ p2.add_atom('Br')
79
+ p2.add_bond(3, 4, 1)
80
+
81
+ # Buchwald-Hartwig amination
82
+ # [N;D23;Zs;W0]-[C;Za]>>[C]-[Br].[N]
83
+ q, p1, p2 = prepare()
84
+ q.add_atom('N', heteroatoms=0, hybridization=1, neighbors=(2, 3))
85
+ q.add_atom('C', hybridization=4)
86
+ q.add_bond(1, 2, 1)
87
+
88
+ p1.add_atom('C', _map=2)
89
+ p1.add_atom('Br')
90
+ p1.add_bond(2, 3, 1)
91
+
92
+ p2.add_atom('N')
93
+
94
+ # alkylation of the imidazole imine nitrogen
95
+ # [C;r5](:[N;r5]-[C;Zs;W1]):[N;D2;r5]>>[C]-[Br].[N]:[C]:[N]
96
+ q, p1, p2 = prepare()
97
+ q.add_atom('N', rings_sizes=5)
98
+ q.add_atom('C', rings_sizes=5)
99
+ q.add_atom('N', rings_sizes=5, neighbors=2)
100
+ q.add_atom('C', hybridization=1, heteroatoms=(1, 2))
101
+ q.add_bond(1, 2, 4)
102
+ q.add_bond(2, 3, 4)
103
+ q.add_bond(1, 4, 1)
104
+
105
+ p1.add_atom('N')
106
+ p1.add_atom('C')
107
+ p1.add_atom('N')
108
+ p1.add_bond(1, 2, 4)
109
+ p1.add_bond(2, 3, 4)
110
+
111
+ p2.add_atom('C', _map=4)
112
+ p2.add_atom('Br')
113
+ p2.add_bond(4, 5, 1)
114
+
115
+ # Knoevenagel condensation (nitrile and carboxyl case)
116
+ # [C]=[C](-[C]#[N])-[C](-[O])=[O]>>[C]=[O].[C](-[C]#[N])-[C](-[O])=[O]
117
+ q, p1, p2 = prepare()
118
+ q.add_atom('C')
119
+ q.add_atom('C')
120
+ q.add_atom('C')
121
+ q.add_atom('N')
122
+ q.add_atom('C')
123
+ q.add_atom('O')
124
+ q.add_atom('O')
125
+ q.add_bond(1, 2, 2)
126
+ q.add_bond(2, 3, 1)
127
+ q.add_bond(3, 4, 3)
128
+ q.add_bond(2, 5, 1)
129
+ q.add_bond(5, 6, 2)
130
+ q.add_bond(5, 7, 1)
131
+
132
+ p1.add_atom('C', _map=2)
133
+ p1.add_atom('C')
134
+ p1.add_atom('N')
135
+ p1.add_atom('C')
136
+ p1.add_atom('O')
137
+ p1.add_atom('O')
138
+ p1.add_bond(2, 3, 1)
139
+ p1.add_bond(3, 4, 3)
140
+ p1.add_bond(2, 5, 1)
141
+ p1.add_bond(5, 6, 2)
142
+ p1.add_bond(5, 7, 1)
143
+
144
+ p2.add_atom('C', _map=1)
145
+ p2.add_atom('O', _map=8)
146
+ p2.add_bond(1, 8, 2)
147
+
148
+ # Knoevenagel condensation (double nitrile case)
149
+ # [C]=[C](-[C]#[N])-[C]#[N]>>[C]=[O].[C](-[C]#[N])-[C]#[N]
150
+ q, p1, p2 = prepare()
151
+ q.add_atom('C')
152
+ q.add_atom('C')
153
+ q.add_atom('C')
154
+ q.add_atom('N')
155
+ q.add_atom('C')
156
+ q.add_atom('N')
157
+ q.add_bond(1, 2, 2)
158
+ q.add_bond(2, 3, 1)
159
+ q.add_bond(3, 4, 3)
160
+ q.add_bond(2, 5, 1)
161
+ q.add_bond(5, 6, 3)
162
+
163
+ p1.add_atom('C', _map=2)
164
+ p1.add_atom('C')
165
+ p1.add_atom('N')
166
+ p1.add_atom('C')
167
+ p1.add_atom('N')
168
+ p1.add_bond(2, 3, 1)
169
+ p1.add_bond(3, 4, 3)
170
+ p1.add_bond(2, 5, 1)
171
+ p1.add_bond(5, 6, 3)
172
+
173
+ p2.add_atom('C', _map=1)
174
+ p2.add_atom('O', _map=8)
175
+ p2.add_bond(1, 8, 2)
176
+
177
+ # Knoevenagel condensation (double carboxyl case)
178
+ # [C]=[C](-[C](-[O])=[O])-[C](-[O])=[O]>>[C]=[O].[C](-[C](-[O])=[O])-[C](-[O])=[O]
179
+ q, p1, p2 = prepare()
180
+ q.add_atom('C')
181
+ q.add_atom('C')
182
+ q.add_atom('C')
183
+ q.add_atom('O')
184
+ q.add_atom('O')
185
+ q.add_atom('C')
186
+ q.add_atom('O')
187
+ q.add_atom('O')
188
+ q.add_bond(1, 2, 2)
189
+ q.add_bond(2, 3, 1)
190
+ q.add_bond(3, 4, 2)
191
+ q.add_bond(3, 5, 1)
192
+ q.add_bond(2, 6, 1)
193
+ q.add_bond(6, 7, 2)
194
+ q.add_bond(6, 8, 1)
195
+
196
+ p1.add_atom('C', _map=2)
197
+ p1.add_atom('C')
198
+ p1.add_atom('O')
199
+ p1.add_atom('O')
200
+ p1.add_atom('C')
201
+ p1.add_atom('O')
202
+ p1.add_atom('O')
203
+ p1.add_bond(2, 3, 1)
204
+ p1.add_bond(3, 4, 2)
205
+ p1.add_bond(3, 5, 1)
206
+ p1.add_bond(2, 6, 1)
207
+ p1.add_bond(6, 7, 2)
208
+ p1.add_bond(6, 8, 1)
209
+
210
+ p2.add_atom('C', _map=1)
211
+ p2.add_atom('O', _map=9)
212
+ p2.add_bond(1, 9, 2)
213
+
214
+ # heterocyclization with guanidine
215
+ # [c]((-[N;W0;Zs])@[n]@[c](-[N;D1])@[c;W0])@[n]@[c]-[O; D1]>>[C](-[N])(=[N])-[N].[C](#[N])-[C]-[C](-[O])=[O]
216
+ q, p1, p2 = prepare()
217
+ q.add_atom('C')
218
+ q.add_atom('N', heteroatoms=0, hybridization=1)
219
+ q.add_atom('N')
220
+ q.add_atom('C')
221
+ q.add_atom('N', neighbors=1)
222
+ q.add_atom('C', heteroatoms=0)
223
+ q.add_atom('N')
224
+ q.add_atom('C')
225
+ q.add_atom('O', neighbors=1)
226
+ q.add_bond(1, 2, 1)
227
+ q.add_bond(1, 3, 4)
228
+ q.add_bond(3, 4, 4)
229
+ q.add_bond(4, 5, 1)
230
+ q.add_bond(4, 6, 4)
231
+ q.add_bond(1, 7, 4)
232
+ q.add_bond(7, 8, 4)
233
+ q.add_bond(8, 9, 1)
234
+
235
+ p1.add_atom('C')
236
+ p1.add_atom('N')
237
+ p1.add_atom('N')
238
+ p1.add_atom('N', _map=7)
239
+ p1.add_bond(1, 2, 1)
240
+ p1.add_bond(1, 3, 2)
241
+ p1.add_bond(1, 7, 1)
242
+
243
+ p2.add_atom('C', _map=4)
244
+ p2.add_atom('N')
245
+ p2.add_atom('C')
246
+ p2.add_atom('C', _map=8)
247
+ p2.add_atom('O', _map=9)
248
+ p2.add_atom('O')
249
+ p2.add_bond(4, 5, 3)
250
+ p2.add_bond(4, 6, 1)
251
+ p2.add_bond(6, 8, 1)
252
+ p2.add_bond(8, 9, 2)
253
+ p2.add_bond(8, 10, 1)
254
+
255
+ # alkylation of amine
256
+ # [C]-[N]-[C]>>[C]-[N].[C]-[Br]
257
+ q, p1, p2 = prepare()
258
+ q.add_atom('C')
259
+ q.add_atom('N')
260
+ q.add_atom('C')
261
+ q.add_atom('C')
262
+ q.add_bond(1, 2, 1)
263
+ q.add_bond(2, 3, 1)
264
+ q.add_bond(2, 4, 1)
265
+
266
+ p1.add_atom('C')
267
+ p1.add_atom('N')
268
+ p1.add_atom('C')
269
+ p1.add_bond(1, 2, 1)
270
+ p1.add_bond(2, 3, 1)
271
+
272
+ p2.add_atom('C', _map=4)
273
+ p2.add_atom('Cl')
274
+ p2.add_bond(4, 5, 1)
275
+
276
+ # Synthesis of guanidines
277
+ #
278
+ q, p1, p2 = prepare()
279
+ q.add_atom('N')
280
+ q.add_atom('C')
281
+ q.add_atom('N', hybridization=1)
282
+ q.add_atom('N', hybridization=1)
283
+ q.add_bond(1, 2, 2)
284
+ q.add_bond(2, 3, 1)
285
+ q.add_bond(2, 4, 1)
286
+
287
+ p1.add_atom('N')
288
+ p1.add_atom('C')
289
+ p1.add_atom('N')
290
+ p1.add_bond(1, 2, 3)
291
+ p1.add_bond(2, 3, 1)
292
+
293
+ p2.add_atom('N', _map=4)
294
+
295
+ # Grignard reaction with nitrile
296
+ #
297
+ q, p1, p2 = prepare()
298
+ q.add_atom('C')
299
+ q.add_atom('C')
300
+ q.add_atom('O')
301
+ q.add_atom('C')
302
+ q.add_bond(1, 2, 1)
303
+ q.add_bond(2, 3, 2)
304
+ q.add_bond(2, 4, 1)
305
+
306
+ p1.add_atom('C')
307
+ p1.add_atom('C')
308
+ p1.add_atom('N')
309
+ p1.add_bond(1, 2, 1)
310
+ p1.add_bond(2, 3, 3)
311
+
312
+ p2.add_atom('C', _map=4)
313
+ p2.add_atom('Br')
314
+ p2.add_bond(4, 5, 1)
315
+
316
+ # Alkylation of alpha-carbon atom of nitrile
317
+ #
318
+ q, p1, p2 = prepare()
319
+ q.add_atom('N')
320
+ q.add_atom('C')
321
+ q.add_atom('C', neighbors=(3, 4))
322
+ q.add_atom('C', hybridization=1)
323
+ q.add_bond(1, 2, 3)
324
+ q.add_bond(2, 3, 1)
325
+ q.add_bond(3, 4, 1)
326
+
327
+ p1.add_atom('N')
328
+ p1.add_atom('C')
329
+ p1.add_atom('C')
330
+ p1.add_bond(1, 2, 3)
331
+ p1.add_bond(2, 3, 1)
332
+
333
+ p2.add_atom('C', _map=4)
334
+ p2.add_atom('Cl')
335
+ p2.add_bond(4, 5, 1)
336
+
337
+ # Gomberg-Bachmann reaction
338
+ #
339
+ q, p1, p2 = prepare()
340
+ q.add_atom('C', hybridization=4, heteroatoms=0)
341
+ q.add_atom('C', hybridization=4, heteroatoms=0)
342
+ q.add_bond(1, 2, 1)
343
+
344
+ p1.add_atom('C')
345
+ p1.add_atom('N', _map=3)
346
+ p1.add_bond(1, 3, 1)
347
+
348
+ p2.add_atom('C', _map=2)
349
+
350
+ # Cyclocondensation
351
+ #
352
+ q, p1, p2 = prepare()
353
+ q.add_atom('N', neighbors=2)
354
+ q.add_atom('C')
355
+ q.add_atom('C')
356
+ q.add_atom('C')
357
+ q.add_atom('N')
358
+ q.add_atom('C')
359
+ q.add_atom('C')
360
+ q.add_atom('O', neighbors=1)
361
+ q.add_bond(1, 2, 1)
362
+ q.add_bond(2, 3, 1)
363
+ q.add_bond(3, 4, 1)
364
+ q.add_bond(4, 5, 2)
365
+ q.add_bond(5, 6, 1)
366
+ q.add_bond(6, 7, 1)
367
+ q.add_bond(7, 8, 2)
368
+ q.add_bond(1, 7, 1)
369
+
370
+ p1.add_atom('N')
371
+ p1.add_atom('C')
372
+ p1.add_atom('C')
373
+ p1.add_atom('C')
374
+ p1.add_atom('O', _map=9)
375
+ p1.add_bond(1, 2, 1)
376
+ p1.add_bond(2, 3, 1)
377
+ p1.add_bond(3, 4, 1)
378
+ p1.add_bond(4, 9, 2)
379
+
380
+ p2.add_atom('N', _map=5)
381
+ p2.add_atom('C')
382
+ p2.add_atom('C')
383
+ p2.add_atom('O')
384
+ p2.add_atom('O', _map=10)
385
+ p2.add_bond(5, 6, 1)
386
+ p2.add_bond(6, 7, 1)
387
+ p2.add_bond(7, 8, 2)
388
+ p2.add_bond(7, 10, 1)
389
+
390
+ # heterocyclization of dicarboxylic acids
391
+ #
392
+ q, p1, p2 = prepare()
393
+ q.add_atom('C', rings_sizes=(5, 6))
394
+ q.add_atom('O')
395
+ q.add_atom(ListElement(['O', 'N']))
396
+ q.add_atom('C', rings_sizes=(5, 6))
397
+ q.add_atom('O')
398
+ q.add_bond(1, 2, 2)
399
+ q.add_bond(1, 3, 1)
400
+ q.add_bond(3, 4, 1)
401
+ q.add_bond(4, 5, 2)
402
+
403
+ p1.add_atom('C')
404
+ p1.add_atom('O')
405
+ p1.add_atom('O', _map=6)
406
+ p1.add_bond(1, 2, 2)
407
+ p1.add_bond(1, 6, 1)
408
+
409
+ p2.add_atom('C', _map=4)
410
+ p2.add_atom('O')
411
+ p2.add_atom('O', _map=7)
412
+ p2.add_bond(4, 5, 2)
413
+ p2.add_bond(4, 7, 1)
414
+
415
+ __all__ = ['rules']
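Each block above registers one retro decomposition: the single query is the product-side pattern and the two product queries are the expected precursors. Below is a minimal, hedged sketch (not part of the commit) of applying the first rule (R-amide/ester cleavage) with the CGRtools Reactor; the target SMILES is illustrative and assumes CGRtools 4.x.

# Minimal sketch (assumption: rules[0] is the R-amide/ester decomposition defined above).
from CGRtools import smiles
from CGRtools.reactor import Reactor

from SynTool.chem.reaction_rules.manual.decompositions import rules

acetanilide = smiles("CC(=O)Nc1ccccc1")
acetanilide.canonicalize()

reactor = Reactor(rules[0])                    # rule applied in the retro direction
for retro_reaction in reactor([acetanilide]):
    print(format(retro_reaction, "m"))         # reaction SMILES: target >> generated precursors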
SynTool/chem/reaction_rules/manual/transformations.py ADDED
@@ -0,0 +1,535 @@
1
+ """
2
+ Module containing hardcoded transformation reaction rules
3
+ """
4
+
5
+ from CGRtools import QueryContainer, ReactionContainer
6
+ from CGRtools.periodictable import ListElement
7
+
8
+ rules = []
9
+
10
+
11
+ def prepare():
12
+ """
13
+ Creates and returns two query containers and appends a reaction container to the "rules" list
14
+ """
15
+ q_ = QueryContainer()
16
+ p_ = QueryContainer()
17
+ rules.append(ReactionContainer((q_,), (p_,)))
18
+ return q_, p_
19
+
20
+
21
+ # aryl nitro reduction
22
+ # [C;Za;W1]-[N;D1]>>[O-]-[N+](-[C])=[O]
23
+ q, p = prepare()
24
+ q.add_atom('N', neighbors=1)
25
+ q.add_atom('C', hybridization=4, heteroatoms=1)
26
+ q.add_bond(1, 2, 1)
27
+
28
+ p.add_atom('N', charge=1)
29
+ p.add_atom('C')
30
+ p.add_atom('O', charge=-1)
31
+ p.add_atom('O')
32
+ p.add_bond(1, 2, 1)
33
+ p.add_bond(1, 3, 1)
34
+ p.add_bond(1, 4, 2)
35
+
36
+ # aryl nitration
37
+ # [O-]-[N+](=[O])-[C;Za;W12]>>[C]
38
+ q, p = prepare()
39
+ q.add_atom('N', charge=1)
40
+ q.add_atom('C', hybridization=4, heteroatoms=(1, 2))
41
+ q.add_atom('O', charge=-1)
42
+ q.add_atom('O')
43
+ q.add_bond(1, 2, 1)
44
+ q.add_bond(1, 3, 1)
45
+ q.add_bond(1, 4, 2)
46
+
47
+ p.add_atom('C', _map=2)
48
+
49
+ # Beckmann rearrangement (oxime -> amide)
50
+ # [C]-[N;D2]-[C]=[O]>>[O]-[N]=[C]-[C]
51
+ q, p = prepare()
52
+ q.add_atom('C')
53
+ q.add_atom('N', neighbors=2)
54
+ q.add_atom('O')
55
+ q.add_atom('C')
56
+ q.add_bond(1, 2, 1)
57
+ q.add_bond(1, 3, 2)
58
+ q.add_bond(2, 4, 1)
59
+
60
+ p.add_atom('C')
61
+ p.add_atom('N')
62
+ p.add_atom('O')
63
+ p.add_atom('C')
64
+ p.add_bond(1, 2, 2)
65
+ p.add_bond(2, 3, 1)
66
+ p.add_bond(1, 4, 1)
67
+
68
+ # aldehydes or ketones into oxime/imine reaction
69
+ # [C;Zd;W1]=[N]>>[C]=[O]
70
+ q, p = prepare()
71
+ q.add_atom('C', hybridization=2, heteroatoms=1)
72
+ q.add_atom('N')
73
+ q.add_bond(1, 2, 2)
74
+
75
+ p.add_atom('C')
76
+ p.add_atom('O', _map=3)
77
+ p.add_bond(1, 3, 2)
78
+
79
+ # addition of halogen atom into phenol ring (ortho)
80
+ # [C](-[Cl,F,Br,I;D1]):[C]-[O,N;Zs]>>[C](-[A]):[C]
81
+ q, p = prepare()
82
+ q.add_atom(ListElement(['O', 'N']), hybridization=1)
83
+ q.add_atom('C')
84
+ q.add_atom('C')
85
+ q.add_atom(ListElement(['Cl', 'F', 'Br', 'I']), neighbors=1)
86
+ q.add_bond(1, 2, 1)
87
+ q.add_bond(2, 3, 4)
88
+ q.add_bond(3, 4, 1)
89
+
90
+ p.add_atom('A')
91
+ p.add_atom('C')
92
+ p.add_atom('C')
93
+ p.add_bond(1, 2, 1)
94
+ p.add_bond(2, 3, 4)
95
+
96
+ # addition of halogen atom into phenol ring (para)
97
+ # [C](:[C]:[C]:[C]-[O,N;Zs])-[Cl,F,Br,I;D1]>>[A]-[C]:[C]:[C]:[C]
98
+ q, p = prepare()
99
+ q.add_atom(ListElement(['O', 'N']), hybridization=1)
100
+ q.add_atom('C')
101
+ q.add_atom('C')
102
+ q.add_atom('C')
103
+ q.add_atom('C')
104
+ q.add_atom(ListElement(['Cl', 'F', 'Br', 'I']), neighbors=1)
105
+ q.add_bond(1, 2, 1)
106
+ q.add_bond(2, 3, 4)
107
+ q.add_bond(3, 4, 4)
108
+ q.add_bond(4, 5, 4)
109
+ q.add_bond(5, 6, 1)
110
+
111
+ p.add_atom('A')
112
+ p.add_atom('C')
113
+ p.add_atom('C')
114
+ p.add_atom('C')
115
+ p.add_atom('C')
116
+ p.add_bond(1, 2, 1)
117
+ p.add_bond(2, 3, 4)
118
+ p.add_bond(3, 4, 4)
119
+ p.add_bond(4, 5, 4)
120
+
121
+ # hard reduction of Ar-ketones
122
+ # [C;Za]-[C;D2;Zs;W0]>>[C]-[C]=[O]
123
+ q, p = prepare()
124
+ q.add_atom('C', hybridization=4)
125
+ q.add_atom('C', hybridization=1, neighbors=2, heteroatoms=0)
126
+ q.add_bond(1, 2, 1)
127
+
128
+ p.add_atom('C')
129
+ p.add_atom('C')
130
+ p.add_atom('O')
131
+ p.add_bond(1, 2, 1)
132
+ p.add_bond(2, 3, 2)
133
+
134
+ # reduction of alpha-hydroxy pyridine
135
+ # [C;W1]:[N;H0;r6]>>[C](:[N])-[O]
136
+ q, p = prepare()
137
+ q.add_atom('C', heteroatoms=1)
138
+ q.add_atom('N', rings_sizes=6, hydrogens=0)
139
+ q.add_bond(1, 2, 4)
140
+
141
+ p.add_atom('C')
142
+ p.add_atom('N')
143
+ p.add_atom('O')
144
+ p.add_bond(1, 2, 4)
145
+ p.add_bond(1, 3, 1)
146
+
147
+ # Reduction of alkene
148
+ # [C]-[C;D23;Zs;W0]-[C;D123;Zs;W0]>>[C](-[C])=[C]
149
+ q, p = prepare()
150
+ q.add_atom('C')
151
+ q.add_atom('C', heteroatoms=0, neighbors=(2, 3), hybridization=1)
152
+ q.add_atom('C', heteroatoms=0, neighbors=(1, 2, 3), hybridization=1)
153
+ q.add_bond(1, 2, 1)
154
+ q.add_bond(2, 3, 1)
155
+
156
+ p.add_atom('C')
157
+ p.add_atom('C')
158
+ p.add_atom('C')
159
+ p.add_bond(1, 2, 1)
160
+ p.add_bond(2, 3, 2)
161
+
162
+ # Kolbe-Schmitt reaction
163
+ # [C](:[C]-[O;D1])-[C](=[O])-[O;D1]>>[C](-[O]):[C]
164
+ q, p = prepare()
165
+ q.add_atom('O', neighbors=1)
166
+ q.add_atom('C')
167
+ q.add_atom('C')
168
+ q.add_atom('C')
169
+ q.add_atom('O', neighbors=1)
170
+ q.add_atom('O')
171
+ q.add_bond(1, 2, 1)
172
+ q.add_bond(2, 3, 4)
173
+ q.add_bond(3, 4, 1)
174
+ q.add_bond(4, 5, 1)
175
+ q.add_bond(4, 6, 2)
176
+
177
+ p.add_atom('O')
178
+ p.add_atom('C')
179
+ p.add_atom('C')
180
+ p.add_bond(1, 2, 1)
181
+ p.add_bond(2, 3, 4)
182
+
183
+ # reduction of carboxylic acid
184
+ # [O;D1]-[C;D2]-[C]>>[C]-[C](-[O])=[O]
185
+ q, p = prepare()
186
+ q.add_atom('C')
187
+ q.add_atom('C', neighbors=2)
188
+ q.add_atom('O', neighbors=1)
189
+ q.add_bond(1, 2, 1)
190
+ q.add_bond(2, 3, 1)
191
+
192
+ p.add_atom('C')
193
+ p.add_atom('C')
194
+ p.add_atom('O')
195
+ p.add_atom('O')
196
+ p.add_bond(1, 2, 1)
197
+ p.add_bond(2, 3, 1)
198
+ p.add_bond(2, 4, 2)
199
+
200
+ # halogenation of alcohols
201
+ # [C;Zs]-[Cl,Br;D1]>>[C]-[O]
202
+ q, p = prepare()
203
+ q.add_atom('C', hybridization=1, heteroatoms=1)
204
+ q.add_atom(ListElement(['Cl', 'Br']), neighbors=1)
205
+ q.add_bond(1, 2, 1)
206
+
207
+ p.add_atom('C')
208
+ p.add_atom('O', _map=3)
209
+ p.add_bond(1, 3, 1)
210
+
211
+ # Kolbe nitrilation
212
+ # [N]#[C]-[C;Zs;W0]>>[Br]-[C]
213
+ q, p = prepare()
214
+ q.add_atom('C', heteroatoms=0, hybridization=1)
215
+ q.add_atom('C')
216
+ q.add_atom('N')
217
+ q.add_bond(1, 2, 1)
218
+ q.add_bond(2, 3, 3)
219
+
220
+ p.add_atom('C')
221
+ p.add_atom('Br', _map=4)
222
+ p.add_bond(1, 4, 1)
223
+
224
+ # Nitrile hydrolysis
225
+ # [O;D1]-[C]=[O]>>[N]#[C]
226
+ q, p = prepare()
227
+ q.add_atom('C')
228
+ q.add_atom('O', neighbors=1)
229
+ q.add_atom('O')
230
+ q.add_bond(1, 2, 1)
231
+ q.add_bond(1, 3, 2)
232
+
233
+ p.add_atom('C')
234
+ p.add_atom('N', _map=4)
235
+ p.add_bond(1, 4, 3)
236
+
237
+ # sulfamidation
238
+ # [c]-[S](=[O])(=[O])-[N]>>[c]
239
+ q, p = prepare()
240
+ q.add_atom('C', hybridization=4)
241
+ q.add_atom('S')
242
+ q.add_atom('O')
243
+ q.add_atom('O')
244
+ q.add_atom('N', neighbors=1)
245
+ q.add_bond(1, 2, 1)
246
+ q.add_bond(2, 3, 2)
247
+ q.add_bond(2, 4, 2)
248
+ q.add_bond(2, 5, 1)
249
+
250
+ p.add_atom('C')
251
+
252
+ # Ring expansion rearrangement
253
+ #
254
+ q, p = prepare()
255
+ q.add_atom('C')
256
+ q.add_atom('N')
257
+ q.add_atom('C', rings_sizes=6)
258
+ q.add_atom('C')
259
+ q.add_atom('O')
260
+ q.add_atom('C')
261
+ q.add_atom('C')
262
+ q.add_bond(1, 2, 1)
263
+ q.add_bond(2, 3, 1)
264
+ q.add_bond(3, 4, 1)
265
+ q.add_bond(4, 5, 2)
266
+ q.add_bond(3, 6, 1)
267
+ q.add_bond(4, 7, 1)
268
+
269
+ p.add_atom('C')
270
+ p.add_atom('N')
271
+ p.add_atom('C')
272
+ p.add_atom('C')
273
+ p.add_atom('O')
274
+ p.add_atom('C')
275
+ p.add_atom('C')
276
+ p.add_bond(1, 2, 1)
277
+ p.add_bond(2, 3, 2)
278
+ p.add_bond(3, 4, 1)
279
+ p.add_bond(4, 5, 1)
280
+ p.add_bond(4, 6, 1)
281
+ p.add_bond(4, 7, 1)
282
+
283
+ # hydrolysis of bromide alkyl
284
+ #
285
+ q, p = prepare()
286
+ q.add_atom('C', hybridization=1)
287
+ q.add_atom('O', neighbors=1)
288
+ q.add_bond(1, 2, 1)
289
+
290
+ p.add_atom('C')
291
+ p.add_atom('Br')
292
+ p.add_bond(1, 2, 1)
293
+
294
+ # Condensation of ketones/aldehydes and amines into imines
295
+ #
296
+ q, p = prepare()
297
+ q.add_atom('N', neighbors=(1, 2))
298
+ q.add_atom('C', neighbors=(2, 3), heteroatoms=1)
299
+ q.add_bond(1, 2, 2)
300
+
301
+ p.add_atom('C', _map=2)
302
+ p.add_atom('O')
303
+ p.add_bond(2, 3, 2)
304
+
305
+ # Halogenation of alkanes
306
+ #
307
+ q, p = prepare()
308
+ q.add_atom('C', hybridization=1)
309
+ q.add_atom(ListElement(['F', 'Cl', 'Br']))
310
+ q.add_bond(1, 2, 1)
311
+
312
+ p.add_atom('C')
313
+
314
+ # heterocyclization
315
+ #
316
+ q, p = prepare()
317
+ q.add_atom('N', heteroatoms=0, hybridization=1, neighbors=(2, 3))
318
+ q.add_atom('C', heteroatoms=2)
319
+ q.add_atom('N', heteroatoms=0, neighbors=2)
320
+ q.add_bond(1, 2, 1)
321
+ q.add_bond(2, 3, 2)
322
+
323
+ p.add_atom('N')
324
+ p.add_atom('C')
325
+ p.add_atom('N')
326
+ p.add_atom('O')
327
+ p.add_bond(1, 2, 1)
328
+ p.add_bond(2, 4, 2)
329
+
330
+ # Reduction of nitrile
331
+ #
332
+ q, p = prepare()
333
+ q.add_atom('N', neighbors=1)
334
+ q.add_atom('C')
335
+ q.add_atom('C', hybridization=1)
336
+ q.add_bond(1, 2, 1)
337
+ q.add_bond(2, 3, 1)
338
+
339
+ p.add_atom('N')
340
+ p.add_atom('C')
341
+ p.add_atom('C')
342
+ p.add_bond(1, 2, 3)
343
+ p.add_bond(2, 3, 1)
344
+
345
+ # SPECIAL CASE
346
+ # Reduction of nitrile into methylamine
347
+ #
348
+ q, p = prepare()
349
+ q.add_atom('C', neighbors=1)
350
+ q.add_atom('N', neighbors=2)
351
+ q.add_atom('C')
352
+ q.add_atom('C', hybridization=1)
353
+ q.add_bond(1, 2, 1)
354
+ q.add_bond(2, 3, 1)
355
+ q.add_bond(3, 4, 1)
356
+
357
+ p.add_atom('N', _map=2)
358
+ p.add_atom('C')
359
+ p.add_atom('C')
360
+ p.add_bond(2, 3, 3)
361
+ p.add_bond(3, 4, 1)
362
+
363
+ # methylation of amides
364
+ #
365
+ q, p = prepare()
366
+ q.add_atom('O')
367
+ q.add_atom('C')
368
+ q.add_atom('N')
369
+ q.add_atom('C', neighbors=1)
370
+ q.add_bond(1, 2, 2)
371
+ q.add_bond(2, 3, 1)
372
+ q.add_bond(3, 4, 1)
373
+
374
+ p.add_atom('O')
375
+ p.add_atom('C')
376
+ p.add_atom('N')
377
+ p.add_bond(1, 2, 2)
378
+ p.add_bond(2, 3, 1)
379
+
380
+ # hydrocyanation of alkenes
381
+ #
382
+ q, p = prepare()
383
+ q.add_atom('C', hybridization=1)
384
+ q.add_atom('C')
385
+ q.add_atom('C')
386
+ q.add_atom('N')
387
+ q.add_bond(1, 2, 1)
388
+ q.add_bond(2, 3, 1)
389
+ q.add_bond(3, 4, 3)
390
+
391
+ p.add_atom('C')
392
+ p.add_atom('C')
393
+ p.add_bond(1, 2, 2)
394
+
395
+ # decarboxylation (alpha atom of nitrile)
396
+ #
397
+ q, p = prepare()
398
+ q.add_atom('N')
399
+ q.add_atom('C')
400
+ q.add_atom('C', neighbors=2)
401
+ q.add_bond(1, 2, 3)
402
+ q.add_bond(2, 3, 1)
403
+
404
+ p.add_atom('N')
405
+ p.add_atom('C')
406
+ p.add_atom('C')
407
+ p.add_atom('C')
408
+ p.add_atom('O')
409
+ p.add_atom('O')
410
+ p.add_bond(1, 2, 3)
411
+ p.add_bond(2, 3, 1)
412
+ p.add_bond(3, 4, 1)
413
+ p.add_bond(4, 5, 2)
414
+ p.add_bond(4, 6, 1)
415
+
416
+ # Bischler-Napieralski reaction
417
+ #
418
+ q, p = prepare()
419
+ q.add_atom('C', rings_sizes=(6,))
420
+ q.add_atom('C', rings_sizes=(6,))
421
+ q.add_atom('N', rings_sizes=(6,), neighbors=2)
422
+ q.add_atom('C')
423
+ q.add_atom('C')
424
+ q.add_atom('C')
425
+ q.add_atom('O')
426
+ q.add_atom('O')
427
+ q.add_atom('C')
428
+ q.add_atom('O', neighbors=1)
429
+ q.add_bond(1, 2, 4)
430
+ q.add_bond(2, 3, 1)
431
+ q.add_bond(3, 4, 1)
432
+ q.add_bond(4, 5, 2)
433
+ q.add_bond(5, 6, 1)
434
+ q.add_bond(6, 7, 2)
435
+ q.add_bond(6, 8, 1)
436
+ q.add_bond(5, 9, 4)
437
+ q.add_bond(9, 10, 1)
438
+ q.add_bond(1, 9, 1)
439
+
440
+ p.add_atom('C')
441
+ p.add_atom('C')
442
+ p.add_atom('N')
443
+ p.add_atom('C')
444
+ p.add_atom('C')
445
+ p.add_atom('C')
446
+ p.add_atom('O')
447
+ p.add_atom('O')
448
+ p.add_atom('C')
449
+ p.add_atom('O')
450
+ p.add_atom('O')
451
+ p.add_bond(1, 2, 4)
452
+ p.add_bond(2, 3, 1)
453
+ p.add_bond(3, 4, 1)
454
+ p.add_bond(4, 5, 2)
455
+ p.add_bond(5, 6, 1)
456
+ p.add_bond(6, 7, 2)
457
+ p.add_bond(6, 8, 1)
458
+ p.add_bond(5, 9, 1)
459
+ p.add_bond(9, 10, 2)
460
+ p.add_bond(9, 11, 1)
461
+
462
+ # heterocyclization in Prins reaction
463
+ #
464
+ q, p = prepare()
465
+ q.add_atom('C')
466
+ q.add_atom('O')
467
+ q.add_atom('C')
468
+ q.add_atom(ListElement(['N', 'O']), neighbors=2)
469
+ q.add_atom('C')
470
+ q.add_atom('C')
471
+ q.add_bond(1, 2, 1)
472
+ q.add_bond(2, 3, 1)
473
+ q.add_bond(3, 4, 1)
474
+ q.add_bond(4, 5, 1)
475
+ q.add_bond(5, 6, 1)
476
+ q.add_bond(1, 6, 1)
477
+
478
+ p.add_atom('C')
479
+ p.add_atom('C', _map=5)
480
+ p.add_bond(1, 5, 2)
481
+
482
+ # recyclization of tetrahydropyran through ring opening and dehydration
483
+ #
484
+ q, p = prepare()
485
+ q.add_atom('C')
486
+ q.add_atom('C')
487
+ q.add_atom('C')
488
+ q.add_atom(ListElement(['N', 'O']))
489
+ q.add_atom('C')
490
+ q.add_atom('C')
491
+ q.add_bond(1, 2, 1)
492
+ q.add_bond(2, 3, 1)
493
+ q.add_bond(3, 4, 1)
494
+ q.add_bond(4, 5, 1)
495
+ q.add_bond(5, 6, 1)
496
+ q.add_bond(1, 6, 2)
497
+
498
+ p.add_atom('C')
499
+ p.add_atom('C')
500
+ p.add_atom('C')
501
+ p.add_atom('A')
502
+ p.add_atom('C')
503
+ p.add_atom('C')
504
+ p.add_atom('O')
505
+ p.add_bond(1, 2, 1)
506
+ p.add_bond(1, 7, 1)
507
+ p.add_bond(3, 7, 1)
508
+ p.add_bond(3, 4, 1)
509
+ p.add_bond(4, 5, 1)
510
+ p.add_bond(5, 6, 1)
511
+ p.add_bond(1, 6, 1)
512
+
513
+ # alkenes + H2O/HHal
514
+ #
515
+ q, p = prepare()
516
+ q.add_atom('C', hybridization=1)
517
+ q.add_atom('C', hybridization=1)
518
+ q.add_atom(ListElement(['O', 'F', 'Cl', 'Br', 'I']), neighbors=1)
519
+ q.add_bond(1, 2, 1)
520
+ q.add_bond(2, 3, 1)
521
+
522
+ p.add_atom('C')
523
+ p.add_atom('C')
524
+ p.add_bond(1, 2, 2)
525
+
526
+ # methylation of dimethylamines
527
+ #
528
+ q, p = prepare()
529
+ q.add_atom('C', neighbors=1)
530
+ q.add_atom('N', neighbors=3)
531
+ q.add_bond(1, 2, 1)
532
+
533
+ p.add_atom('N', _map=2)
534
+
535
+ __all__ = ['rules']
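The transformations follow the same prepare() pattern but keep a single precursor query. As a hedged illustration only (not part of the commit), one more retro transformation could be registered at the end of this module as sketched below; the chemistry shown (retro chlorination of an aromatic carbon) is an arbitrary example.

# Illustrative only: registering an additional retro transformation with prepare().
# Query: aryl chloride; precursor pattern: the bare aryl carbon (retro chlorination).
q, p = prepare()
q.add_atom('C', hybridization=4)
q.add_atom('Cl', neighbors=1)
q.add_bond(1, 2, 1)

p.add_atom('C')   # the halogen atom is dropped in the precursor pattern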
SynTool/chem/retron.py ADDED
@@ -0,0 +1,132 @@
1
+ """
2
+ Module containing a class Retron that represents a retron (extended molecule object) in the search tree
3
+ """
4
+
5
+ from CGRtools.containers import MoleculeContainer
6
+ from CGRtools.exceptions import InvalidAromaticRing
7
+
8
+ from SynTool.chem.utils import safe_canonicalization
9
+
10
+
11
+ class Retron:
12
+ """
13
+ Retron class is used to extend the molecule behavior needed for interaction with a tree in MCTS
14
+ """
15
+
16
+ def __init__(self, molecule: MoleculeContainer, canonicalize: bool = True):
17
+ """
18
+ It initializes a Retron object with a molecule container as a parameter.
19
+
20
+ :param molecule: The `molecule` parameter is of type `MoleculeContainer`.
21
+ :type molecule: MoleculeContainer
22
+ """
23
+ self._molecule = safe_canonicalization(molecule) if canonicalize else molecule
24
+ self._mapping = None
25
+ self.prev_retrons = []
26
+
27
+ def __len__(self):
28
+ """
29
+ Return the number of atoms in Retron.
30
+ """
31
+ return len(self._molecule)
32
+
33
+ def __hash__(self):
34
+ """
35
+ Returns the hash value of Retron.
36
+ """
37
+ return hash(self._molecule)
38
+
39
+ def __str__(self):
40
+ return str(self._molecule)
41
+
42
+ def __eq__(self, other: "Retron"):
43
+ """
44
+ The function checks if the current Retron is equal to another Retron of the same class.
45
+
46
+ :param other: The "other" parameter is a reference to another object of the same class "Retron". It is used to
47
+ compare the current Retron with the other Retron to check if they are equal
48
+ :type other: "Retron"
49
+ """
50
+ return self._molecule == other._molecule
51
+
52
+ def validate_molecule(self):
53
+ molecule = self._molecule.copy()
54
+ try:
55
+ molecule.kekule()
56
+ if molecule.check_valence():
57
+ return False
58
+ molecule.thiele()
59
+ except InvalidAromaticRing:
60
+ return False
61
+ return True
62
+
63
+ @property
64
+ def molecule(self) -> MoleculeContainer:
65
+ """
66
+ Returns a remapped copy of the molecule if a mapping is set, otherwise a plain copy.
67
+ """
68
+ if self._mapping:
69
+ remapped = self._molecule.copy()
70
+ try:
71
+ remapped = self._molecule.remap(self._mapping, copy=True)
72
+ except ValueError:
73
+ pass
74
+ return remapped
75
+ return self._molecule.copy()
76
+
77
+ def __repr__(self):
78
+ """
79
+ Returns a SMILES of the retron
80
+ """
81
+ return str(self._molecule)
82
+
83
+ def is_building_block(self, stock, min_mol_size=6):
84
+ """
85
+ The function checks if a Retron is a building block.
86
+
87
+ :param min_mol_size: molecules with this number of atoms or fewer are automatically treated as building blocks.
88
+ :param stock: The collection of building blocks. Each building block is represented by a SMILES string.
89
+ """
90
+ if len(self._molecule) <= min_mol_size:
91
+ return True
92
+ else:
93
+ return str(self._molecule) in stock
94
+
95
+
96
+ def compose_retrons(retrons: list = None, exclude_small=True, min_mol_size: int = 6
97
+ ) -> MoleculeContainer:
98
+ """
99
+ The function takes a list of retrons, excludes small retrons if specified, and composes them into a single molecule.
100
+ The composed molecule is used to predict the synthesizability score that characterizes the possible success of the
101
+ synthesis path containing the nodes with the given retrons.
102
+
103
+ :param retrons: The list of retrons to be composed.
104
+ :type retrons: list
105
+ :param exclude_small: The parameter that determines whether small retrons should be
106
+ excluded from the composition process. If `exclude_small` is set to `True`, only retrons with a length greater than
107
+ min_mol_size will be considered for composition.
108
+ :param min_mol_size: the minimum number of atoms a retron must have to be kept when exclude_small is True
109
+ :return: The composed retrons as a single MoleculeContainer object.
110
+ """
111
+
112
+ if len(retrons) == 1:
113
+ return retrons[0].molecule
114
+ elif len(retrons) > 1:
115
+ if exclude_small:
116
+ big_retrons = [
117
+ retron for retron in retrons if len(retron.molecule) > min_mol_size
118
+ ]
119
+ if big_retrons:
120
+ retrons = big_retrons
121
+ tmp_mol = retrons[0].molecule.copy()
122
+ transition_mapping = {}
123
+ for mol in retrons[1:]:
124
+ for n, atom in mol.molecule.atoms():
125
+ new_number = tmp_mol.add_atom(atom.atomic_symbol)
126
+ transition_mapping[n] = new_number
127
+ for atom, neighbor, bond in mol.molecule.bonds():
128
+ tmp_mol.add_bond(
129
+ transition_mapping[atom], transition_mapping[neighbor], bond
130
+ )
131
+ transition_mapping = {}
132
+ return tmp_mol
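A hedged usage sketch (not part of the commit) of the Retron API above; the SMILES strings and the in-memory stock are illustrative only.

# Illustrative only: wrapping molecules as retrons and composing them for evaluation.
from CGRtools import smiles

from SynTool.chem.retron import Retron, compose_retrons

stock = {"OB(O)c1ccccc1"}                       # toy building-block stock of SMILES strings

retrons = [Retron(smiles("O=C(O)c1ccccc1CN1CCOCC1")),
           Retron(smiles("BrCC(=O)OC"))]

print([r.is_building_block(stock, min_mol_size=6) for r in retrons])

# small retrons are dropped before composition when exclude_small is True
composed = compose_retrons(retrons, exclude_small=True, min_mol_size=6)
print(composed)                                 # single MoleculeContainer used for scoring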
SynTool/chem/utils.py ADDED
@@ -0,0 +1,227 @@
1
+ from typing import List, Iterable, Tuple, Union
2
+
3
+ from CGRtools.containers import MoleculeContainer, ReactionContainer, QueryContainer
4
+ from CGRtools.exceptions import InvalidAromaticRing
5
+
6
+
7
+ def query_to_mol(query: QueryContainer) -> MoleculeContainer:
8
+ """
9
+ Converts a QueryContainer object into a MoleculeContainer object.
10
+
11
+ :param query: A QueryContainer object representing the query structure.
12
+ :return: A MoleculeContainer object that replicates the structure of the query.
13
+ """
14
+ new_mol = MoleculeContainer()
15
+ for n, atom in query.atoms():
16
+ new_mol.add_atom(atom.atomic_symbol, n, charge=atom.charge, is_radical=atom.is_radical)
17
+ for i, j, bond in query.bonds():
18
+ new_mol.add_bond(i, j, int(bond))
19
+ return new_mol
20
+
21
+
22
+ def reaction_query_to_reaction(rule: ReactionContainer) -> ReactionContainer:
23
+ """
24
+ Converts a ReactionContainer object with query structures into a ReactionContainer with molecular structures.
25
+
26
+ :param rule: A ReactionContainer object where reactants and products are QueryContainer objects.
27
+ :return: A new ReactionContainer object where reactants and products are MoleculeContainer objects.
29
+ """
30
+ reactants = [query_to_mol(q) for q in rule.reactants]
31
+ products = [query_to_mol(q) for q in rule.products]
32
+ reagents = [query_to_mol(q) for q in rule.reagents] # Assuming reagents are also part of the rule
33
+ reaction = ReactionContainer(reactants, products, reagents, rule.meta)
34
+ reaction.name = rule.name
35
+ return reaction
36
+
37
+
38
+ def unite_molecules(molecules: Iterable[MoleculeContainer]) -> MoleculeContainer:
39
+ """
40
+ Unites a list of MoleculeContainer objects into a single MoleculeContainer.
41
+
42
+ This function takes multiple molecules and combines them into one larger molecule.
43
+ The first molecule in the list is taken as the base, and subsequent molecules are united with it sequentially.
44
+
45
+ :param molecules: A list of MoleculeContainer objects to be united.
46
+ :return: A single MoleculeContainer object representing the union of all input molecules.
47
+ """
48
+ new_mol = MoleculeContainer()
49
+ for mol in molecules:
50
+ new_mol = new_mol.union(mol)
51
+ return new_mol
52
+
53
+
54
+ def safe_canonicalization(molecule: MoleculeContainer):
55
+ """
56
+ Attempts to canonicalize a molecule, handling any exceptions.
57
+
58
+ This function tries to canonicalize the given molecule.
59
+ If the canonicalization process fails due to an InvalidAromaticRing exception,
60
+ it safely returns the original molecule.
61
+
62
+ :param molecule: The given molecule to be canonicalized.
63
+ :return: The canonicalized molecule if successful, otherwise the original molecule.
64
+ """
65
+ molecule._atoms = dict(sorted(molecule._atoms.items()))
66
+
67
+ tmp = molecule.copy()
68
+ try:
69
+ tmp.canonicalize()
70
+ return tmp
71
+ except InvalidAromaticRing:
72
+ return molecule
73
+
74
+
75
+ def split_molecules(molecules: Iterable, number_of_atoms: int) -> Tuple[List, List]:
76
+ """
77
+ Splits molecules according to the number of heavy atoms.
78
+
79
+ :param molecules: Iterable of molecules.
80
+ :param number_of_atoms: Threshold for splitting molecules.
81
+ :return: Tuple of lists containing "big" molecules and "small" molecules.
82
+ """
83
+ big_molecules, small_molecules = [], []
84
+ for molecule in molecules:
85
+ if len(molecule) > number_of_atoms:
86
+ big_molecules.append(molecule)
87
+ else:
88
+ small_molecules.append(molecule)
89
+
90
+ return big_molecules, small_molecules
91
+
92
+
93
+ def remove_small_molecules(
94
+ reaction: ReactionContainer,
95
+ number_of_atoms: int = 6,
96
+ small_molecules_to_meta: bool = True
97
+ ) -> Union[ReactionContainer, None]:
98
+ """
99
+ Processes a reaction by removing small molecules.
100
+
101
+ :param reaction: ReactionContainer object.
102
+ :param number_of_atoms: Molecules with the number of atoms equal to or below this will be removed.
103
+ :param small_molecules_to_meta: If True, deleted molecules are saved to meta.
104
+ :return: Processed ReactionContainer without small molecules.
105
+ """
106
+ new_reactants, small_reactants = split_molecules(reaction.reactants, number_of_atoms)
107
+ new_products, small_products = split_molecules(reaction.products, number_of_atoms)
108
+
109
+ if sum(len(mol) for mol in new_reactants) == 0 or sum(len(mol) for mol in new_products) == 0:
110
+ return None
111
+
112
+ new_reaction = ReactionContainer(new_reactants, new_products, reaction.reagents, reaction.meta)
113
+ new_reaction.name = reaction.name
114
+
115
+ if small_molecules_to_meta:
116
+ united_small_reactants = unite_molecules(small_reactants)
117
+ new_reaction.meta["small_reactants"] = str(united_small_reactants)
118
+
119
+ united_small_products = unite_molecules(small_products)
120
+ new_reaction.meta["small_products"] = str(united_small_products)
121
+
122
+ return new_reaction
123
+
124
+
125
+ def rebalance_reaction(reaction: ReactionContainer) -> ReactionContainer:
126
+ """
127
+ Rebalances the reaction by assembling CGR and then decomposing it. Works for all reactions for which the correct
128
+ CGR can be assembled
129
+
130
+ :param reaction: a reaction object
131
+ :return: a rebalanced reaction
132
+ """
133
+ tmp_reaction = ReactionContainer(reaction.reactants, reaction.products)
134
+ cgr = ~tmp_reaction
135
+ reactants, products = ~cgr
136
+ rebalanced_reaction = ReactionContainer(reactants.split(), products.split(), reaction.reagents, reaction.meta)
137
+ rebalanced_reaction.name = reaction.name
138
+ return rebalanced_reaction
139
+
140
+
141
+ def reverse_reaction(reaction: ReactionContainer) -> ReactionContainer:
142
+ """
143
+ Reverses given reaction
144
+
145
+ :param reaction: a reaction object
146
+ :return: the reversed reaction
147
+ """
148
+ reversed_reaction = ReactionContainer(reaction.products, reaction.reactants, reaction.reagents, reaction.meta)
149
+ reversed_reaction.name = reaction.name
150
+
151
+ return reversed_reaction
152
+
153
+
154
+ def remove_reagents(
155
+ reaction: ReactionContainer,
156
+ keep_reagents: bool = True,
157
+ reagents_max_size: int = 7
158
+ ) -> Union[ReactionContainer, None]:
159
+ """
160
+ Removes reagents (not changed molecules or molecules not involved in the reaction) from reactants and products
161
+
162
+ :param reaction: a reaction object
163
+ :param keep_reagents: if True, the reagents are written to ReactionContainer
164
+ :param reagents_max_size: max size of molecules that are called reagents, bigger are deleted
165
+ :return: cleaned reaction
166
+ """
167
+ not_changed_molecules = set(reaction.reactants).intersection(reaction.products)
168
+
169
+ cgr = ~reaction
170
+ center_atoms = set(cgr.center_atoms)
171
+
172
+ new_reactants = []
173
+ new_products = []
174
+ new_reagents = []
175
+
176
+ for molecule in reaction.reactants:
177
+ if center_atoms.isdisjoint(molecule) or molecule in not_changed_molecules:
178
+ new_reagents.append(molecule)
179
+ else:
180
+ new_reactants.append(molecule)
181
+
182
+ for molecule in reaction.products:
183
+ if center_atoms.isdisjoint(molecule) or molecule in not_changed_molecules:
184
+ new_reagents.append(molecule)
185
+ else:
186
+ new_products.append(molecule)
187
+
188
+ if sum(len(mol) for mol in new_reactants) == 0 or sum(len(mol) for mol in new_products) == 0:
189
+ return None
190
+
191
+ if keep_reagents:
192
+ new_reagents = {molecule for molecule in new_reagents if len(molecule) <= reagents_max_size}
193
+ else:
194
+ new_reagents = []
195
+
196
+ new_reaction = ReactionContainer(new_reactants, new_products, new_reagents, reaction.meta)
197
+ new_reaction.name = reaction.name
198
+
199
+ return new_reaction
200
+
201
+
202
+ def to_reaction_smiles_record(reaction):
203
+ if isinstance(reaction, str):
204
+ return reaction
205
+
206
+ reaction_record = [format(reaction, "m")]
207
+ sorted_meta = sorted(reaction.meta.items(), key=lambda x: x[0])
208
+ for _, meta_info in sorted_meta:
209
+ # meta_info = str(meta_info)
210
+ meta_info = '' # TODO decide what to do with meta
211
+ meta_info = ";".join(meta_info.split("\n"))
212
+ reaction_record.append(str(meta_info))
213
+ # return "\t".join(reaction_record) + "\n"
214
+ return "".join(reaction_record)
215
+
216
+
217
+ def cgr_from_rule(rule: ReactionContainer):
218
+ reaction_rule = reaction_query_to_reaction(rule)
219
+ cgr_rule = ~reaction_rule
220
+ return cgr_rule
221
+
222
+
223
+ def hash_from_rule(reaction_rule: ReactionContainer):
224
+ reactants_hash = tuple(sorted(hash(r) for r in reaction_rule.reactants))
225
+ reagents_hash = tuple(sorted(hash(r) for r in reaction_rule.reagents))
226
+ products_hash = tuple(sorted(hash(r) for r in reaction_rule.products))
227
+ return hash((reactants_hash, reagents_hash, products_hash))
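A hedged sketch (not part of the commit) chaining several of the helpers above on one atom-mapped reaction; the reaction SMILES and its mapping are illustrative.

# Illustrative only: a typical cleanup chain for a single atom-mapped reaction.
from CGRtools import smiles

from SynTool.chem.utils import remove_reagents, rebalance_reaction, reverse_reaction

# toy mapped Fischer esterification (mapping numbers are illustrative)
rxn = smiles("[CH3:1][C:2](=[O:3])[OH:4].[OH:5][CH3:6]>>"
             "[CH3:1][C:2](=[O:3])[O:5][CH3:6].[OH2:4]")

cleaned = remove_reagents(rxn, keep_reagents=True, reagents_max_size=7)
if cleaned is not None:
    balanced = rebalance_reaction(cleaned)   # CGR round-trip rebalances both sides
    retro = reverse_reaction(balanced)       # products become reactants for retro rules
    print(format(retro, "m"))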
SynTool/interfaces/__init__.py ADDED
File without changes
SynTool/interfaces/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (155 Bytes). View file
 
SynTool/interfaces/__pycache__/visualisation.cpython-310.pyc ADDED
Binary file (11 kB). View file
 
SynTool/interfaces/cli.py ADDED
@@ -0,0 +1,530 @@
1
+ """
2
+ Module containing command line scripts for training and planning modes
3
+ """
4
+
5
+ import os
6
+ import shutil
7
+ import yaml
8
+ import warnings
9
+ from pathlib import Path
10
+
11
+ import click
12
+ import gdown
13
+
14
+ from SynTool.chem.data.cleaning import reactions_cleaner
15
+ from SynTool.chem.data.filtering import filter_reactions, ReactionCheckConfig
16
+ from SynTool.utils.loading import standardize_building_blocks
17
+ from SynTool.chem.reaction_rules.extraction import extract_rules_from_reactions
18
+ from SynTool.mcts.search import tree_search
19
+ from SynTool.ml.training.reinforcement import run_reinforcement_tuning
20
+ from SynTool.ml.training.supervised import create_policy_dataset, run_policy_training
21
+ from SynTool.utils.config import ReinforcementConfig, TreeConfig, PolicyNetworkConfig, ValueNetworkConfig
22
+ from SynTool.utils.config import ReactionStandardizationConfig, RuleExtractionConfig
23
+ from SynTool.chem.data.mapping import remove_reagents_and_map_from_file
24
+
25
+ warnings.filterwarnings("ignore")
26
+
27
+
28
+ @click.group(name="syntool")
29
+ def syntool():
30
+ pass
31
+
32
+
33
+ @syntool.command(name="download_planning_data")
34
+ @click.option(
35
+ "--root_dir",
36
+ required=True,
37
+ type=click.Path(exists=True),
38
+ help="Path to the reaction database file that will be mapped.",
39
+ )
40
+ def download_planning_data_cli(root_dir='.'):
41
+ """
42
+ Downloads data for retrosynthesis planning
43
+ """
44
+ remote_id = "1ygq9BvQgH2Tq_rL72BvSOdASSSbPFTsL"
45
+ output = os.path.join(root_dir, "syntool_planning_data.zip")
46
+ #
47
+ gdown.download(output=output, id=remote_id, quiet=False)
48
+ shutil.unpack_archive(output, root_dir)
49
+ #
50
+ os.remove(output)
51
+
52
+
53
+ @syntool.command(name='download_training_data')
54
+ @click.option(
55
+ "--root_dir",
56
+ required=True,
57
+ type=click.Path(exists=True),
58
+ help="Path to the reaction database file that will be mapped.",
59
+ )
60
+ def download_training_data_cli(root_dir='.'):
61
+ """
62
+ Downloads data for retrosynthetic model training
63
+ """
64
+ remote_id = "1ckhO1l6xud0_bnC0rCDMkIlKRUMG_xs8"
65
+ output = os.path.join(root_dir, "syntool_training_data.zip")
66
+ #
67
+ gdown.download(output=output, id=remote_id, quiet=False)
68
+ shutil.unpack_archive(output, root_dir)
69
+ #
70
+ os.remove(output)
71
+
72
+
73
+ @syntool.command(name="building_blocks")
74
+ @click.option(
75
+ "--input",
76
+ "input_file",
77
+ required=True,
78
+ type=click.Path(exists=True),
79
+ help="Path to the reaction database file that will be mapped.",
80
+ )
81
+ @click.option(
82
+ "--output",
83
+ "output_file",
84
+ required=True,
85
+ type=click.Path(),
86
+ help="File where the results will be stored.",
87
+ )
88
+ def building_blocks_cli(input_file, output_file):
89
+ """
90
+ Standardizes building blocks
91
+ """
92
+
93
+ standardize_building_blocks(input_file=input_file, output_file=output_file)
94
+
95
+
96
+ @syntool.command(name="reaction_mapping")
97
+ @click.option(
98
+ "--config",
99
+ "config_path",
100
+ required=True,
101
+ type=click.Path(exists=True),
102
+ help="Path to the configuration file. This file contains settings for mapping and standardizing reactions.",
103
+ )
104
+ @click.option(
105
+ "--input",
106
+ "input_file",
107
+ required=True,
108
+ type=click.Path(exists=True),
109
+ help="Path to the reaction database file that will be mapped.",
110
+ )
111
+ @click.option(
112
+ "--output",
113
+ "output_file",
114
+ default=Path("reaction_data_standardized.smi"),
115
+ type=click.Path(),
116
+ help="File where the results will be stored.",
117
+ )
118
+ def reaction_mapping_cli(config_path, input_file, output_file):
119
+ """
120
+ Reaction data mapping
121
+ """
122
+
123
+ stand_config = ReactionStandardizationConfig.from_yaml(config_path)
124
+ remove_reagents_and_map_from_file(input_file=input_file, output_file=output_file, keep_reagent=stand_config.keep_reagents)
125
+
126
+
127
+ @syntool.command(name="reaction_standardizing")
128
+ @click.option(
129
+ "--config",
130
+ "config_path",
131
+ required=True,
132
+ type=click.Path(exists=True),
133
+ help="Path to the configuration file. This file contains settings for mapping and standardizing reactions.",
134
+ )
135
+ @click.option(
136
+ "--input",
137
+ "input_file",
138
+ required=True,
139
+ type=click.Path(exists=True),
140
+ help="Path to the reaction database file that will be mapped.",
141
+ )
142
+ @click.option(
143
+ "--output",
144
+ "output_file",
145
+ type=click.Path(),
146
+ help="File where the results will be stored.",
147
+ )
148
+ @click.option(
149
+ "--num_cpus",
150
+ default=8,
151
+ type=int,
152
+ help="Number of CPUs to use for processing. Defaults to 1.",
153
+ )
154
+ def reaction_standardizing_cli(config_path, input_file, output_file, num_cpus):
155
+ """
156
+ Standardizes reactions and remove duplicates
157
+ """
158
+
159
+ stand_config = ReactionStandardizationConfig.from_yaml(config_path)
160
+ reactions_cleaner(config=stand_config,
161
+ input_file=input_file,
162
+ output_file=output_file,
163
+ num_cpus=num_cpus)
164
+
165
+
166
+ @syntool.command(name="reaction_filtering")
167
+ @click.option(
168
+ "--config",
169
+ "config_path",
170
+ required=True,
171
+ type=click.Path(exists=True),
172
+ help="Path to the configuration file. This file contains settings for filtering reactions.",
173
+ )
174
+ @click.option(
175
+ "--input",
176
+ "input_file",
177
+ required=True,
178
+ type=click.Path(exists=True),
179
+ help="Path to the reaction database file that will be mapped.",
180
+ )
181
+ @click.option(
182
+ "--output",
183
+ "output_file",
184
+ default=Path("./"),
185
+ type=click.Path(),
186
+ help="File where the results will be stored.",
187
+ )
188
+ @click.option(
189
+ "--append_results",
190
+ is_flag=True,
191
+ default=False,
192
+ help="If set, results will be appended to existing files. By default, new files are created.",
193
+ )
194
+ @click.option(
195
+ "--batch_size",
196
+ default=100,
197
+ type=int,
198
+ help="Size of the batch for processing reactions. Defaults to 10.",
199
+ )
200
+ @click.option(
201
+ "--num_cpus",
202
+ default=8,
203
+ type=int,
204
+ help="Number of CPUs to use for processing. Defaults to 1.",
205
+ )
206
+ def reaction_filtering_cli(config_path,
207
+ input_file,
208
+ output_file,
209
+ append_results,
210
+ batch_size,
211
+ num_cpus):
212
+ """
213
+ Filters erroneous reactions
214
+ """
215
+ reaction_check_config = ReactionCheckConfig().from_yaml(config_path)
216
+ filter_reactions(
217
+ config=reaction_check_config,
218
+ reaction_database_path=input_file,
219
+ result_reactions_file_name=output_file,
220
+ append_results=append_results,
221
+ num_cpus=num_cpus,
222
+ batch_size=batch_size,
223
+ )
224
+
225
+
226
+ @syntool.command(name="rule_extracting")
227
+ @click.option(
228
+ "--config",
229
+ "config_path",
230
+ required=True,
231
+ type=click.Path(exists=True),
232
+ help="Path to the configuration file. This file contains settings for reaction rules extraction.",
233
+ )
234
+ @click.option(
235
+ "--input",
236
+ "input_file",
237
+ required=True,
238
+ type=click.Path(exists=True),
239
+ help="Path to the reaction database file that will be mapped.",
240
+ )
241
+ @click.option(
242
+ "--output",
243
+ "output_file",
244
+ required=True,
245
+ type=click.Path(),
246
+ help="File where the results will be stored.",
247
+ )
248
+ @click.option(
249
+ "--batch_size",
250
+ default=100,
251
+ type=int,
252
+ help="Size of the batch for processing reactions. Defaults to 10.",
253
+ )
254
+ @click.option(
255
+ "--num_cpus",
256
+ default=4,
257
+ type=int,
258
+ help="Number of CPUs to use for processing. Defaults to 4.",
259
+ )
260
+ def rule_extracting_cli(
261
+ config_path,
262
+ input_file,
263
+ output_file,
264
+ num_cpus,
265
+ batch_size,
266
+ ):
267
+ """
268
+ Extracts reaction rules
269
+ """
270
+
271
+ reaction_rule_config = RuleExtractionConfig.from_yaml(config_path)
272
+ extract_rules_from_reactions(config=reaction_rule_config,
273
+ reaction_file=input_file,
274
+ rules_file_name=output_file,
275
+ num_cpus=num_cpus,
276
+ batch_size=batch_size)
277
+
278
+
279
+ @syntool.command(name="supervised_ranking_policy_training")
280
+ @click.option(
281
+ "--config",
282
+ "config_path",
283
+ required=True,
284
+ type=click.Path(exists=True),
285
+ help="Path to the configuration file. This file contains settings for policy training.",
286
+ )
287
+ @click.option(
288
+ "--reaction_data",
289
+ required=True,
290
+ type=click.Path(exists=True),
291
+ help="Path to the reaction database file that will be mapped.",
292
+ )
293
+ @click.option(
294
+ "--reaction_rules",
295
+ required=True,
296
+ type=click.Path(exists=True),
297
+ help="Path to the reaction database file that will be mapped.",
298
+ )
299
+ @click.option(
300
+ "--results_dir",
301
+ default=Path("."),
302
+ type=click.Path(),
303
+ help="Root directory where the results will be stored.",
304
+ )
305
+ @click.option(
306
+ "--num_cpus",
307
+ default=4,
308
+ type=int,
309
+ help="Number of CPUs to use for processing. Defaults to 4.",
310
+ )
311
+ def supervised_ranking_policy_training_cli(config_path, reaction_data, reaction_rules, results_dir, num_cpus):
312
+ """
313
+ Trains ranking policy network
314
+ """
315
+
316
+ policy_config = PolicyNetworkConfig.from_yaml(config_path)
317
+
318
+ policy_dataset_file = os.path.join(results_dir, 'policy_dataset.dt')
319
+
320
+ datamodule = create_policy_dataset(reaction_rules_path=reaction_rules,
321
+ molecules_or_reactions_path=reaction_data,
322
+ output_path=policy_dataset_file,
323
+ dataset_type='ranking',
324
+ batch_size=policy_config.batch_size,
325
+ num_cpus=num_cpus)
326
+
327
+ run_policy_training(datamodule, config=policy_config, results_path=results_dir)
328
+
329
+
330
+ @syntool.command(name="supervised_filtering_policy_training")
331
+ @click.option(
332
+ "--config",
333
+ "config_path",
334
+ required=True,
335
+ type=click.Path(exists=True),
336
+ help="Path to the configuration file. This file contains settings for policy training.",
337
+ )
338
+ @click.option(
339
+ "--molecules_file",
340
+ required=True,
341
+ type=click.Path(exists=True),
342
+ help="Path to the molecules database file that will be mapped.",
343
+ )
344
+ @click.option(
345
+ "--reaction_rules",
346
+ required=True,
347
+ type=click.Path(exists=True),
348
+ help="Path to the reaction database file that will be mapped.",
349
+ )
350
+ @click.option(
351
+ "--results_dir",
352
+ default=Path("."),
353
+ type=click.Path(),
354
+ help="Root directory where the results will be stored.",
355
+ )
356
+ @click.option(
357
+ "--num_cpus",
358
+ default=8,
359
+ type=int,
360
+ help="Number of CPUs to use for processing. Defaults to 1.",
361
+ )
362
+ def supervised_filtering_policy_training_cli(config_path, molecules_file, reaction_rules, results_dir, num_cpus):
363
+ """
364
+ Trains filtering policy network
365
+ """
366
+
367
+ policy_config = PolicyNetworkConfig.from_yaml(config_path)
368
+
369
+ policy_dataset_file = os.path.join(results_dir, 'policy_dataset.ckpt')
370
+ datamodule = create_policy_dataset(reaction_rules_path=reaction_rules,
371
+ molecules_or_reactions_path=molecules_file,
372
+ output_path=policy_dataset_file,
373
+ dataset_type='filtering',
374
+ batch_size=policy_config.batch_size,
375
+ num_cpus=num_cpus)
376
+
377
+ run_policy_training(datamodule, config=policy_config, results_path=results_dir)
378
+
379
+
380
+ @syntool.command(name="reinforcement_value_network_training")
381
+ @click.option(
382
+ "--config",
383
+ required=True,
384
+ type=click.Path(exists=True),
385
+ help="Path to the configuration file. This file contains settings for policy training.",
386
+ )
387
+ @click.option(
388
+ "--targets",
389
+ required=True,
390
+ type=click.Path(exists=True),
391
+ help="Path to the configuration file. This file contains settings for policy training.",
392
+ )
393
+ @click.option(
394
+ "--reaction_rules",
395
+ required=True,
396
+ type=click.Path(exists=True),
397
+ help="Path to the configuration file. This file contains settings for policy training.",
398
+ )
399
+ @click.option(
400
+ "--building_blocks",
401
+ required=True,
402
+ type=click.Path(exists=True),
403
+ help="Path to the configuration file. This file contains settings for policy training.",
404
+ )
405
+ @click.option(
406
+ "--policy_network",
407
+ required=True,
408
+ type=click.Path(exists=True),
409
+ help="Path to the configuration file. This file contains settings for policy training.",
410
+ )
411
+ @click.option(
412
+ "--value_network",
413
+ default=None,
414
+ type=click.Path(exists=True),
415
+ help="Path to the configuration file. This file contains settings for policy training.",
416
+ )
417
+ @click.option(
418
+ "--results_dir",
419
+ default='.',
420
+ type=click.Path(exists=False),
421
+ help="Path to the configuration file. This file contains settings for policy training.",
422
+ )
423
+ def reinforcement_value_network_training_cli(config,
424
+ targets,
425
+ reaction_rules,
426
+ building_blocks,
427
+ policy_network,
428
+ value_network,
429
+ results_dir):
430
+ """
431
+ Trains value network with reinforcement learning
432
+ """
433
+
434
+ with open(config, "r") as file:
435
+ config = yaml.safe_load(file)
436
+
437
+ policy_config = PolicyNetworkConfig.from_dict(config['node_expansion'])
438
+ policy_config.weights_path = policy_network
439
+
440
+ value_config = ValueNetworkConfig.from_dict(config['value_network'])
441
+ if value_network is None:
442
+ value_config.weights_path = os.path.join(results_dir, 'weights', 'value_network.ckpt')
443
+
444
+ tree_config = TreeConfig.from_dict(config['tree'])
445
+ reinforce_config = ReinforcementConfig.from_dict(config['reinforcement'])
446
+
447
+ run_reinforcement_tuning(targets_path=targets,
448
+ tree_config=tree_config,
449
+ policy_config=policy_config,
450
+ value_config=value_config,
451
+ reinforce_config=reinforce_config,
452
+ reaction_rules_path=reaction_rules,
453
+ building_blocks_path=building_blocks,
454
+ results_root=results_dir)
455
+
456
+
457
+ @syntool.command(name="planning")
458
+ @click.option(
459
+ "--config",
460
+ "config_path",
461
+ required=True,
462
+ type=click.Path(exists=True),
463
+ help="Path to the configuration file. This file contains settings for policy training.",
464
+ )
465
+ @click.option(
466
+ "--targets",
467
+ required=True,
468
+ type=click.Path(exists=True),
469
+ help="Path to the configuration file. This file contains settings for policy training.",
470
+ )
471
+ @click.option(
472
+ "--reaction_rules",
473
+ required=True,
474
+ type=click.Path(exists=True),
475
+ help="Path to the configuration file. This file contains settings for policy training.",
476
+ )
477
+ @click.option(
478
+ "--building_blocks",
479
+ required=True,
480
+ type=click.Path(exists=True),
481
+ help="Path to the configuration file. This file contains settings for policy training.",
482
+ )
483
+ @click.option(
484
+ "--policy_network",
485
+ required=True,
486
+ type=click.Path(exists=True),
487
+ help="Path to the configuration file. This file contains settings for policy training.",
488
+ )
489
+ @click.option(
490
+ "--value_network",
491
+ default=None,
492
+ type=click.Path(exists=True),
493
+ help="Path to the configuration file. This file contains settings for policy training.",
494
+ )
495
+ @click.option(
496
+ "--results_dir",
497
+ default='.',
498
+ type=click.Path(exists=False),
499
+ help="Path to the configuration file. This file contains settings for policy training.",
500
+ )
501
+ def planning_cli(config_path,
502
+ targets,
503
+ reaction_rules,
504
+ building_blocks,
505
+ policy_network,
506
+ value_network,
507
+ results_dir):
508
+ """
509
+ Runs retrosynthesis planning
510
+ """
511
+
512
+ with open(config_path, "r") as file:
513
+ config = yaml.safe_load(file)
514
+
515
+ tree_config = TreeConfig.from_dict({**config['tree'], **config['node_evaluation']})
516
+ policy_config = PolicyNetworkConfig.from_dict({**config['node_expansion'], **{'weights_path': policy_network}})
517
+
518
+ tree_search(targets_path=targets,
519
+ tree_config=tree_config,
520
+ policy_config=policy_config,
521
+ reaction_rules_path=reaction_rules,
522
+ building_blocks_path=building_blocks,
523
+ value_weights_path=value_network,
524
+ results_root=results_dir)
525
+
526
+
527
+ if __name__ == '__main__':
528
+ syntool()
529
+
530
+
SynTool/interfaces/cli.py.bk ADDED
@@ -0,0 +1,241 @@
1
+ """
2
+ Module containing commands line scripts for training and planning mode
3
+ """
4
+
5
+ import warnings
6
+ import os
7
+ import shutil
8
+ from pathlib import Path
9
+ import click
10
+ import gdown
11
+ from datetime import datetime
12
+
13
+ from Syntool.chem.reaction_rules.extraction import extract_rules_from_reactions
14
+ from Syntool.chem.data.cleaning import reactions_cleaner
15
+ from Syntool.chem.data.mapping import remove_reagents_and_map_from_file
16
+ from Syntool.chem.loading import standardize_building_blocks
17
+ from Syntool.ml.training import create_policy_dataset, run_policy_training
18
+ from Syntool.ml.training.reinforcement import run_self_tuning
19
+ from Syntool.ml.networks.policy import PolicyNetworkConfig
20
+ from Syntool.utils.config import read_planning_config, read_training_config, TreeConfig
21
+ from Syntool.mcts.search import tree_search
22
+
23
+ from Syntool.chem.data.filtering import (
24
+ filter_reactions,
25
+ ReactionCheckConfig,
26
+ CCRingBreakingConfig,
27
+ WrongCHBreakingConfig,
28
+ CCsp3BreakingConfig,
29
+ DynamicBondsConfig,
30
+ MultiCenterConfig,
31
+ NoReactionConfig,
32
+ SmallMoleculesConfig,
33
+ )
34
+
35
+ warnings.filterwarnings("ignore")
36
+ main = click.Group()
37
+
38
+
39
+ @main.command(name='planning_data')
40
+ def planning_data_cli():
41
+ """
42
+ Downloads a file from Google Drive using its remote ID, saves it as a zip file, extracts the contents,
43
+ and then deletes the zip file
44
+ """
45
+ remote_id = '1c5YJDT-rP1ZvFA-ELmPNTUj0b8an4yFf'
46
+ output = 'synto_planning_data.zip'
47
+ #
48
+ gdown.download(output=output, id=remote_id, quiet=True)
49
+ shutil.unpack_archive(output, './')
50
+ #
51
+ os.remove(output)
52
+
53
+
54
+ @main.command(name='training_data')
55
+ def training_data_cli():
56
+ """
57
+ Downloads a file from Google Drive using its remote ID, saves it as a zip file, extracts the contents,
58
+ and then deletes the zip file
59
+ """
60
+ remote_id = '1r4I7OskGvzg-zxYNJ7WVYpVR2HSYW10N'
61
+ output = 'synto_training_data.zip'
62
+ #
63
+ gdown.download(output=output, id=remote_id, quiet=True)
64
+ shutil.unpack_archive(output, './')
65
+ #
66
+ os.remove(output)
67
+
68
+
69
+ @main.command(name='syntool_planning')
70
+ @click.option("--config",
71
+ "config_path",
72
+ required=True,
73
+ help="Path to the config YAML molecules_path.",
74
+ type=click.Path(exists=True, path_type=Path),
75
+ )
76
+ def syntool_planning_cli(config_path):
77
+ """
78
+ Launches tree search for the given target molecules and stores search statistics and found retrosynthetic paths
79
+
80
+ :param config_path: The path to the configuration file that contains the settings and parameters for the tree search
81
+ """
82
+ config = read_planning_config(config_path)
83
+ config['Tree']['silent'] = True
84
+
85
+ # standardize building blocks
86
+ if config['InputData']['standardize_building_blocks']:
87
+ print('STANDARDIZE BUILDING BLOCKS ...')
88
+ standardize_building_blocks(config['InputData']['building_blocks_path'],
89
+ config['InputData']['building_blocks_path'])
90
+ # run planning
91
+ print('\nRUN PLANNING ...')
92
+ tree_config = TreeConfig.from_dict(config['Tree'])
93
+ tree_search(targets=config['General']['targets_path'],
94
+ tree_config=tree_config,
95
+ reaction_rules_path=config['InputData']['reaction_rules_path'],
96
+ building_blocks_path=config['InputData']['building_blocks_path'],
97
+ policy_weights_path=config['PolicyNetwork']['weights_path'],
98
+ value_weights_paths=config['ValueNetwork']['weights_path'],
99
+ results_root=config['General']['results_root'])
100
+
101
+
102
+ @main.command(name='syntool_training')
103
+ @click.option(
104
+ "--config",
105
+ "config_path",
106
+ required=True,
107
+ help="Path to the config YAML file.",
108
+ type=click.Path(exists=True, path_type=Path)
109
+ )
110
+ def syntool_training_cli(config_path):
111
+
112
+ # read training config
113
+ print('READ CONFIG ...')
114
+ config = read_training_config(config_path)
115
+ print('Config is read')
116
+
117
+ reaction_data_file = config['InputData']['reaction_data_path']
118
+
119
+ # reaction data mapping
120
+ startTime0 = datetime.now()
121
+ data_output_folder = os.path.join(config['General']['results_root'], 'reaction_data')
122
+ Path(data_output_folder).mkdir(parents=True, exist_ok=True)
123
+ mapped_data_file = os.path.join(data_output_folder, 'reaction_data_mapped.smi')
124
+ if config['DataCleaning']['map_reactions']:
125
+ print('\nMAP REACTION DATA ...')
126
+
127
+ remove_reagents_and_map_from_file(input_file=config['InputData']['reaction_data_path'],
128
+ output_file=mapped_data_file)
129
+
130
+ reaction_data_file = mapped_data_file
131
+ print("remove_reagents_and_map_from_file:", datetime.now() - startTime0)
132
+
133
+ # reaction data cleaning
134
+ startTime0 = datetime.now()
135
+ cleaned_data_file = os.path.join(data_output_folder, 'reaction_data_cleaned.rdf')
136
+ if config['DataCleaning']['clean_reactions']:
137
+ print('\nCLEAN REACTION DATA ...')
138
+
139
+ reactions_cleaner(input_file=reaction_data_file,
140
+ output_file=cleaned_data_file,
141
+ num_cpus=config['General']['num_cpus'])
142
+
143
+ reaction_data_file = cleaned_data_file
144
+ print("reactions_cleaner:", datetime.now() - startTime0)
145
+
146
+ # reactions data filtering
147
+ startTime0 = datetime.now()
148
+ if config['DataCleaning']['filter_reactions']:
149
+ print('\nFILTER REACTION DATA ...')
150
+ #
151
+ filtration_config = ReactionCheckConfig(
152
+ remove_small_molecules=False,
153
+ small_molecules_config=SmallMoleculesConfig(limit=6),
154
+ dynamic_bonds_config=DynamicBondsConfig(min_bonds_number=1, max_bonds_number=6),
155
+ no_reaction_config=NoReactionConfig(),
156
+ multi_center_config=MultiCenterConfig(),
157
+ wrong_ch_breaking_config=WrongCHBreakingConfig(),
158
+ cc_sp3_breaking_config=CCsp3BreakingConfig(),
159
+ cc_ring_breaking_config=CCRingBreakingConfig()
160
+ )
161
+
162
+ filtered_data_file = os.path.join(data_output_folder, 'reaction_data_filtered.rdf')
163
+ filter_reactions(config=filtration_config,
164
+ reaction_database_path=reaction_data_file,
165
+ result_directory_path=data_output_folder,
166
+ result_reactions_file_name='reaction_data_filtered',
167
+ num_cpus=config['General']['num_cpus'],
168
+ batch_size=100)
169
+
170
+ reaction_data_file = filtered_data_file
171
+ print("filter_reactions:", datetime.now() - startTime0)
172
+
173
+ # standardize building blocks
174
+ startTime0 = datetime.now()
175
+ if config['DataCleaning']['standardize_building_blocks']:
176
+ print('\nSTANDARDIZE BUILDING BLOCKS ...')
177
+
178
+ standardize_building_blocks(config['InputData']['building_blocks_path'],
179
+ config['InputData']['building_blocks_path'])
180
+ print("standardize_building_blocks:", datetime.now() - startTime0)
181
+
182
+ # reaction rules extraction
183
+ startTime0 = datetime.now()
184
+ print('\nEXTRACT REACTION RULES ...')
185
+
186
+ rules_output_folder = os.path.join(config['General']['results_root'], 'reaction_rules')
187
+ Path(rules_output_folder).mkdir(parents=True, exist_ok=True)
188
+ reaction_rules_path = os.path.join(rules_output_folder, 'reaction_rules_filtered.pickle')
189
+ config['InputData']['reaction_rules_path'] = reaction_rules_path
190
+
191
+ extract_rules_from_reactions(config=config,
192
+ reaction_file=reaction_data_file,
193
+ results_root=rules_output_folder,
194
+ num_cpus=config['General']['num_cpus'])
195
+ print("extract_rules_from_reactions:", datetime.now() - startTime0)
196
+
197
+ # create policy network dataset
198
+ startTime0 = datetime.now()
199
+ print('\nCREATE POLICY NETWORK DATASET ...')
200
+ policy_output_folder = os.path.join(config['General']['results_root'], 'policy_network')
201
+ Path(policy_output_folder).mkdir(parents=True, exist_ok=True)
202
+ policy_data_file = os.path.join(policy_output_folder, 'policy_dataset.pt')
203
+
204
+ if config['PolicyNetwork']['policy_type'] == 'ranking':
205
+ molecules_or_reactions_path = reaction_data_file
206
+ elif config['PolicyNetwork']['policy_type'] == 'filtering':
207
+ molecules_or_reactions_path = config['InputData']['policy_data_path']
208
+ else:
209
+ raise ValueError(
210
+ "Invalid policy_type. Allowed values are 'ranking', 'filtering'."
211
+ )
212
+
213
+ datamodule = create_policy_dataset(reaction_rules_path=reaction_rules_path,
214
+ molecules_or_reactions_path=molecules_or_reactions_path,
215
+ output_path=policy_data_file,
216
+ dataset_type=config['PolicyNetwork']['policy_type'],
217
+ batch_size=config['PolicyNetwork']['batch_size'],
218
+ num_cpus=config['General']['num_cpus'])
219
+ print("datamodule:", datetime.now() - startTime0)
220
+
221
+ # train policy network
222
+ startTime0 = datetime.now()
223
+ print('\nTRAIN POLICY NETWORK ...')
224
+ policy_config = PolicyNetworkConfig.from_dict(config['PolicyNetwork'])
225
+ run_policy_training(datamodule, config=policy_config, results_path=policy_output_folder)
226
+ config['PolicyNetwork']['weights_path'] = os.path.join(policy_output_folder, 'weights', 'policy_network.ckpt')
227
+ print("run_policy_training:", datetime.now() - startTime0)
228
+
229
+ # self-tuning value network training
230
+ startTime0 = datetime.now()
231
+ print('\nTRAIN VALUE NETWORK ...')
232
+ value_output_folder = os.path.join(config['General']['results_root'], 'value_network')
233
+ Path(value_output_folder).mkdir(parents=True, exist_ok=True)
234
+ config['ValueNetwork']['weights_path'] = os.path.join(value_output_folder, 'weights', 'value_network.ckpt')
235
+
236
+ run_self_tuning(config, results_root=value_output_folder)
237
+ print("run_self_tuning:", datetime.now() - startTime0)
238
+
239
+
240
+ if __name__ == '__main__':
241
+ main()
SynTool/interfaces/visualisation.py ADDED
@@ -0,0 +1,346 @@
1
+ """
2
+ Module containing functions for analysis and visualization of the built search tree
3
+ """
4
+
5
+ from itertools import count, islice
6
+
7
+ from CGRtools.containers import MoleculeContainer
8
+
9
+ from SynTool import Tree
10
+ from SynTool.utils import path_type
11
+
12
+
13
+ def get_child_nodes(tree, molecule, graph):
14
+ nodes = []
15
+ try:
16
+ graph[molecule]
17
+ except KeyError:
18
+ return []
19
+ for retron in graph[molecule]:
20
+ temp_obj = {
21
+ "smiles": str(retron),
22
+ "type": "mol",
23
+ "in_stock": str(retron) in tree.building_blocks,
24
+ }
25
+ node = get_child_nodes(tree, retron, graph)
26
+ if node:
27
+ temp_obj["children"] = [node]
28
+ nodes.append(temp_obj)
29
+ return {"type": "reaction", "children": nodes}
30
+
31
+
32
+ def extract_routes(tree, extended=False):
33
+ """
34
+ Extracts the retrosynthetic routes found in the search tree and returns them as a list of
+ nested dictionaries (one dictionary per found route).
+
+ :param tree: the built search tree
+ :param extended: if True, routes are extracted from all solved nodes instead of only the winning nodes
+ :return: A list of dictionaries. Each dictionary contains the target SMILES, a list of children, and a
+ boolean indicating whether the target is in building_blocks.
39
+ """
40
+ target = tree.nodes[1].retrons_to_expand[0].molecule
41
+ target_in_stock = tree.nodes[1].curr_retron.is_building_block(tree.building_blocks)
42
+ # Append encoded routes to list
43
+ paths_block = []
44
+ winning_nodes = []
45
+ if extended:
46
+ # Gather paths
47
+ for i, node in tree.nodes.items():
48
+ if node.is_solved():
49
+ winning_nodes.append(i)
50
+ else:
51
+ winning_nodes = tree.winning_nodes
52
+ if winning_nodes:
53
+ for winning_node in winning_nodes:
54
+ # Create graph for route
55
+ nodes = tree.path_to_node(winning_node)
56
+ graph, pred = {}, {}
57
+ for before, after in zip(nodes, nodes[1:]):
58
+ before = before.curr_retron.molecule
59
+ graph[before] = after = [x.molecule for x in after.new_retrons]
60
+ for x in after:
61
+ pred[x] = before
62
+
63
+ paths_block.append({"type": "mol", "smiles": str(target),
64
+ "in_stock": target_in_stock,
65
+ "children": [get_child_nodes(tree, target, graph)]})
66
+ else:
67
+ paths_block = [{"type": "mol", "smiles": str(target), "in_stock": target_in_stock, "children": []}]
68
+ return paths_block
69
+
70
+
71
+ def path_graph(tree, node: int) -> str:
72
+ """
73
+ Visualizes reaction path
74
+
75
+ :param tree: the built search tree
+ :param node: node id
76
+ :type node: int
77
+ :return: The SVG string.
78
+ """
79
+ nodes = tree.path_to_node(node)
80
+ # Set up node_id types for different box colors
81
+ for node in nodes:
82
+ for retron in node.new_retrons:
83
+ retron._molecule.meta["status"] = "instock" if retron.is_building_block(
84
+ tree.building_blocks) else "molecule"
85
+ nodes[0].curr_retron._molecule.meta["status"] = "target"
86
+ # Box colors
87
+ box_colors = {"target": "#98EEFF", # 152, 238, 255
88
+ "mulecule": "#F0AB90", # 240, 171, 144
89
+ "instock": "#9BFAB3", # 155, 250, 179
90
+ }
91
+
92
+ # first column is target
93
+ # second column are first new retrons_to_expand
94
+ columns = [[nodes[0].curr_retron.molecule], [x.molecule for x in nodes[1].new_retrons], ]
95
+ pred = {x: 0 for x in range(1, len(columns[1]) + 1)}
96
+ cx = [n for n, x in enumerate(nodes[1].new_retrons, 1) if not x.is_building_block(tree.building_blocks)]
97
+ size = len(cx)
98
+ nodes = iter(nodes[2:])
99
+ cy = count(len(columns[1]) + 1)
100
+ while size:
101
+ layer = []
102
+ for s in islice(nodes, size):
103
+ n = cx.pop(0)
104
+ for x in s.new_retrons:
105
+ layer.append(x)
106
+ m = next(cy)
107
+ if not x.is_building_block(tree.building_blocks):
108
+ cx.append(m)
109
+ pred[m] = n
110
+ size = len(cx)
111
+ columns.append([x.molecule for x in layer])
112
+
113
+ columns = [columns[::-1] for columns in columns[::-1]] # Reverse array to make retrosynthetic graph
114
+ pred = tuple( # Change dict to tuple to make multiple retrons_to_expand available
115
+ (abs(source - len(pred)), abs(target - len(pred))) for target, source in pred.items())
116
+
117
+ # now we have columns for visualizing
118
+ # lets start recalculate XY
119
+ x_shift = 0.0
120
+ c_max_x = 0.0
121
+ c_max_y = 0.0
122
+ render = []
123
+ cx = count()
124
+ cy = count()
125
+ arrow_points = {}
126
+ for ms in columns:
127
+ heights = []
128
+ for m in ms:
129
+ m.clean2d()
130
+ # X-shift for target
131
+ min_x = min(x for x, y in m._plane.values()) - x_shift
132
+ min_y = min(y for x, y in m._plane.values())
133
+ m._plane = {n: (x - min_x, y - min_y) for n, (x, y) in m._plane.items()}
134
+ max_x = max(x for x, y in m._plane.values())
135
+ if max_x > c_max_x:
136
+ c_max_x = max_x
137
+ arrow_points[next(cx)] = [x_shift, max_x]
138
+ heights.append(max(y for x, y in m._plane.values()))
139
+
140
+ x_shift = c_max_x + 5.0 # between columns gap
141
+ # calculate Y-shift
142
+ y_shift = sum(heights) + 3.0 * (len(heights) - 1)
143
+ if y_shift > c_max_y:
144
+ c_max_y = y_shift
145
+ y_shift /= 2.0
146
+ for m, h in zip(ms, heights):
147
+ m._plane = {n: (x, y - y_shift) for n, (x, y) in m._plane.items()}
148
+
149
+ # Calculate coordinates for boxes
150
+ max_x = max(x for x, y in m._plane.values()) + 0.9 # Max x
151
+ min_x = min(x for x, y in m._plane.values()) - 0.6 # Min x
152
+ max_y = -(max(y for x, y in m._plane.values()) + 0.45) # Max y
153
+ min_y = -(min(y for x, y in m._plane.values()) - 0.45) # Min y
154
+ x_delta = abs(max_x - min_x)
155
+ y_delta = abs(max_y - min_y)
156
+ box = (
157
+ f'<rect x="{min_x}" y="{max_y}" rx="{y_delta * 0.1}" ry="{y_delta * 0.1}" width="{x_delta}" height="{y_delta}"'
158
+ f' stroke="black" stroke-width=".0025" fill="{box_colors[m.meta["status"]]}" fill-opacity="0.30"/>')
159
+ arrow_points[next(cy)].append(y_shift - h / 2.0)
160
+ y_shift -= h + 3.0
161
+ depicted_molecule = list(m.depict(embedding=True))[:3]
162
+ depicted_molecule.append(box)
163
+ render.append(depicted_molecule)
164
+
165
+ # Calculate mid-X coordinate to draw square arrows
166
+ graph = {}
167
+ for s, p in pred:
168
+ try:
169
+ graph[s].append(p)
170
+ except KeyError:
171
+ graph[s] = [p]
172
+ for s, ps in graph.items():
173
+ mid_x = float("-inf")
174
+ for p in ps:
175
+ s_min_x, s_max, s_y = arrow_points[s][:3] # s
176
+ p_min_x, p_max, p_y = arrow_points[p][:3] # p
177
+ p_max += 1
178
+ mid = p_max + (s_min_x - p_max) / 3
179
+ if mid > mid_x:
180
+ mid_x = mid
181
+ for p in ps:
182
+ arrow_points[p].append(mid_x)
183
+
184
+ config = MoleculeContainer._render_config
185
+ font_size = config["font_size"]
186
+ font125 = 1.25 * font_size
187
+ width = c_max_x + 4.0 * font_size # 3.0 by default
188
+ height = c_max_y + 3.5 * font_size # 2.5 by default
189
+ box_y = height / 2.0
190
+ svg = [f'<svg width="{0.6 * width:.2f}cm" height="{0.6 * height:.2f}cm" '
191
+ f'viewBox="{-font125:.2f} {-box_y:.2f} {width:.2f} '
192
+ f'{height:.2f}" xmlns="http://www.w3.org/2000/svg" version="1.1">',
193
+ ' <defs>\n <marker id="arrow" markerWidth="10" markerHeight="10" '
194
+ 'refX="0" refY="3" orient="auto">\n <path d="M0,0 L0,6 L9,3"/>\n </marker>\n </defs>', ]
195
+
196
+ for s, p in pred:
197
+ """
198
+ (x1, y1) = (p_max, p_y)
199
+ (x2, y2) = (s_min_x, s_y)
200
+ polyline: (x1 y1, x2 y2, x3 y3, ..., xN yN)
201
+ """
202
+ s_min_x, s_max, s_y = arrow_points[s][:3]
203
+ p_min_x, p_max, p_y = arrow_points[p][:3]
204
+ p_max += 1
205
+ mid_x = arrow_points[p][-1] # p_max + (s_min_x - p_max) / 3
206
+ """print(f"s_min_x: {s_min_x}, s_max: {s_max}, s_y: {s_y}")
207
+ print(f"p_min_x: {p_min_x}, p_max: {p_max}, p_y: {p_y}")
208
+ print(f"mid_x: {mid_x}\n")"""
209
+
210
+ arrow = f""" <polyline points="{p_max:.2f} {p_y:.2f}, {mid_x:.2f} {p_y:.2f}, {mid_x:.2f} {s_y:.2f}, {s_min_x - 1.:.2f} {s_y:.2f}"
211
+ fill="none" stroke="black" stroke-width=".04" marker-end="url(#arrow)"/>"""
212
+ if p_y != s_y:
213
+ arrow += f' <circle cx="{mid_x}" cy="{p_y}" r="0.1"/>'
214
+ svg.append(arrow)
215
+ for atoms, bonds, masks, box in render:
216
+ molecule_svg = MoleculeContainer._graph_svg(atoms, bonds, masks, -font125, -box_y, width, height)
217
+ molecule_svg.insert(1, box)
218
+ svg.extend(molecule_svg)
219
+ svg.append("</svg>")
220
+ return "\n".join(svg)
221
+
222
+
223
+ def to_table(tree: Tree, html_path: path_type, aam: bool = False, extended=False, integration: bool = False):
224
+ """
225
+ Write an HTML page with the synthesis paths in SVG format and corresponding reactions in SMILES format
226
+
227
+ :param tree: # TODO
228
+ :param extended: # TODO
229
+ :param html_path: Path to save the HTML molecules_path, if None returns the html without saving it
230
+ :type html_path: str (optional)
231
+ :param aam: depict atom-to-atom mapping
232
+ :type aam: bool (optional)
233
+ :param integration: Whenever to output the full html file (False) or only the body (True)
234
+ :type integration: bool
235
+ """
236
+ if aam:
237
+ MoleculeContainer.depict_settings(aam=True)
238
+ else:
239
+ MoleculeContainer.depict_settings(aam=False)
240
+
241
+ paths = []
242
+ if extended:
243
+ # Gather paths
244
+ for idx, node in tree.nodes.items():
245
+ if node.is_solved():
246
+ paths.append(idx)
247
+ else:
248
+ paths = tree.winning_nodes
249
+ # HTML Tags
250
+ th = '<th style="text-align: left; background-color:#978785; border: 1px solid black; border-spacing: 0">'
251
+ td = '<td style="text-align: left; border: 1px solid black; border-spacing: 0">'
252
+ font_red = "<font color='red' style='font-weight: bold'>"
253
+ font_green = "<font color='light-green' style='font-weight: bold'>"
254
+ font_head = "<font style='font-weight: bold; font-size: 18px'>"
255
+ font_normal = "<font style='font-weight: normal; font-size: 18px'>"
256
+ font_close = "</font>"
257
+
258
+ template_begin = """
259
+ <!doctype html>
260
+ <html lang="en">
261
+ <head>
262
+ <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css"
263
+ rel="stylesheet"
264
+ integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3"
265
+ crossorigin="anonymous">
266
+ <script
267
+ src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"
268
+ integrity="sha384-ka7Sk0Gln4gmtz2MlQnikT1wXgYsOg+OMhuP+IlRH9sENBO0LRn5q+8nbTov4+1p"
269
+ crossorigin="anonymous">
270
+ </script>
271
+ <meta charset="utf-8">
272
+ <meta name="viewport" content="width=device-width, initial-scale=1">
273
+ <title>Predicted Paths Report</title>
274
+ <meta name="description" content="A simple HTML5 Template for new projects.">
275
+ <meta name="author" content="SitePoint">
276
+ </head>
277
+ <body>
278
+ """
279
+ template_end = """
280
+ </body>
281
+ </html>
282
+ """
283
+ # SVG Template
284
+ # box_mark = """
285
+ # <svg width="30" height="30" viewBox="0 0 1 1" xmlns="http://www.w3.org/2000/svg">
286
+ # <circle cx="0.5" cy="0.5" r="0.5" fill="rgb()" fill-opacity="0.35" />
287
+ # </svg>
288
+ # """
289
+ # table = f"<table><thead><{th}>Retrosynthetic Routes</th></thead><tbody>"
290
+ table = """<table class="table table-striped table-hover caption-top">"""
291
+ if not integration:
292
+ table += "<caption><h3>Retrosynthetic Routes Report</h3></caption><tbody>"
293
+ else:
294
+ table += "<tbody>"
295
+
296
+ # Gather path data
297
+ table += f"<tr>{td}{font_normal}Target Molecule: {str(tree.nodes[1].curr_retron)}{font_close}</td></tr>"
298
+ table += (f"<tr>{td}{font_normal}Tree Size: {len(tree)}{font_close} nodes</td></tr>")
299
+ table += f"<tr>{td}{font_normal}Number of visited nodes: {len(tree.visited_nodes)}{font_close}</td></tr>"
300
+ table += f"<tr>{td}{font_normal}Found paths: {len(paths)}{font_close}</td></tr>"
301
+ table += f"<tr>{td}{font_normal}Time: {round(tree.curr_time, 4)}{font_close} seconds</td></tr>"
302
+ table += f"""\
303
+ <tr>{td} \
304
+ <svg width="30" height="30" viewBox="0 0 1 1" xmlns="http://www.w3.org/2000/svg"> \
305
+ <circle cx="0.5" cy="0.5" r="0.5" fill="rgb(152, 238, 255)" fill-opacity="0.35" /></svg> \
306
+ Target Molecule \
307
+ \
308
+ <svg width="30" height="30" viewBox="0 0 1 1" xmlns="http://www.w3.org/2000/svg"> \
309
+ <circle cx="0.5" cy="0.5" r="0.5" fill="rgb(240, 171, 144)" fill-opacity="0.35" /></svg> \
310
+ Molecule Not In Stock \
311
+ \
312
+ <svg width="30" height="30" viewBox="0 0 1 1" xmlns="http://www.w3.org/2000/svg"> \
313
+ <circle cx="0.5" cy="0.5" r="0.5" fill="rgb(155, 250, 179)" fill-opacity="0.35" /></svg> \
314
+ Molecule In Stock \
315
+ \
316
+ </td></tr> \
317
+ """
318
+
319
+ for path in paths:
320
+ svg = path_graph(tree, path) # Get SVG
321
+ full_path = tree.synthesis_path(path) # Get Path
322
+ # Write SMILES of all reactions in synthesis path
323
+ step = 1
324
+ reactions = ""
325
+ for synth_step in full_path:
326
+ reactions += f"<b>Step {step}:</b> {str(synth_step)}<br>"
327
+ step += 1
328
+ # Concatenate all content of path
329
+ path_score = round(tree.path_score(path), 3)
330
+ table += (f'<tr style="line-height: 250%">{td}{font_head}Path {path}; '
331
+ f"Steps: {len(full_path)}; "
332
+ f"Cumulated nodes' value: {path_score}{font_close}</td></tr>")
333
+ # f"Cumulated nodes' value: {node._probabilities[path]}{font_close}</td></tr>"
334
+ table += f"<tr>{td}{svg}</td></tr>"
335
+ table += f"<tr>{td}{reactions}</td></tr>"
336
+ table += "</tbody>"
337
+
338
+ # Save or display output
339
+ if not html_path:
340
+ return table if integration else template_begin + table + template_end
341
+
342
+ output = html_path
343
+ with open(output, "w") as html_file:
344
+ html_file.write(template_begin)
345
+ html_file.write(table)
346
+ html_file.write(template_end)
SynTool/mcts/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from .node import *
2
+ from .tree import *
3
+ from CGRtools.containers import MoleculeContainer
4
+
5
+ MoleculeContainer.depict_settings(aam=False)
6
+
7
+ __all__ = ["Tree", "Node"]
SynTool/mcts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (322 Bytes). View file
 
SynTool/mcts/__pycache__/evaluation.cpython-310.pyc ADDED
Binary file (2.21 kB). View file
 
SynTool/mcts/__pycache__/expansion.cpython-310.pyc ADDED
Binary file (2.86 kB). View file
 
SynTool/mcts/__pycache__/node.cpython-310.pyc ADDED
Binary file (2.15 kB). View file
 
SynTool/mcts/__pycache__/search.cpython-310.pyc ADDED
Binary file (4.68 kB). View file
 
SynTool/mcts/__pycache__/tree.cpython-310.pyc ADDED
Binary file (18.7 kB). View file
 
SynTool/mcts/evaluation.py ADDED
@@ -0,0 +1,59 @@
1
+ """
2
+ Module containing a class that represents a value function for prediction of synthesisability
3
+ of new nodes in the search tree
4
+ """
5
+
6
+ import logging
7
+ import torch
8
+
9
+ from pathlib import Path
10
+
11
+ from SynTool.chem.retron import compose_retrons
12
+ from SynTool.ml.networks.value import ValueNetwork
13
+ from SynTool.ml.training import mol_to_pyg
14
+
15
+
16
+ class ValueFunction:
17
+ """
18
+ Value function based on value neural network for node evaluation (synthesisability prediction) in MCTS
19
+ """
20
+
21
+ def __init__(self, weights_path: str) -> None:
22
+ """
23
+ The value function predicts the probability to synthesize the target molecule with available building blocks
24
+ starting from a given retron.
25
+
26
+ :param weights_path: The value network weights location
27
+ :type weights_path: str
28
+ """
29
+
30
+ value_net = ValueNetwork.load_from_checkpoint(
31
+ weights_path,
32
+ map_location=torch.device("cpu")
33
+ )
34
+
35
+ self.value_network = value_net.eval()
36
+
37
+ def predict_value(self, retrons: list) -> float:
38
+ """
39
+ The function predicts a value based on the given retrons. For prediction, retrons must be composed into a single
40
+ molecule (product)
41
+
42
+ :param retrons: The list of retrons
43
+ :type retrons: list
44
+ """
45
+
46
+ molecule = compose_retrons(retrons=retrons, exclude_small=True)
47
+ pyg_graph = mol_to_pyg(molecule)
48
+ if pyg_graph:
49
+ with torch.no_grad():
50
+ value_pred = self.value_network.forward(pyg_graph)[0].item()
51
+ else:
52
+ try:
53
+ logging.debug(f"Molecule {str(molecule)} was not preprocessed. Giving value equal to -1e6.")
54
+ except Exception:
55
+ logging.debug(f"There is a molecule for which SMILES cannot be generated")
56
+
57
+ value_pred = -1e6
58
+
59
+ return value_pred
SynTool/mcts/expansion.py ADDED
@@ -0,0 +1,83 @@
1
+ """
2
+ Module containing a class that represents a policy function for node expansion in the search tree
3
+ """
4
+
5
+ import torch
6
+ import torch_geometric
7
+ from SynTool.chem.retron import Retron
8
+ from SynTool.ml.networks.policy import PolicyNetwork
9
+ from SynTool.ml.training import mol_to_pyg
10
+ from SynTool.utils.config import PolicyNetworkConfig
11
+
12
+
13
+ class PolicyFunction:
14
+ """
15
+ Policy function based on policy neural network for node expansion in MCTS
16
+ """
17
+
18
+ def __init__(self, policy_config: PolicyNetworkConfig, compile: bool = False):
19
+ """
20
+ Initializes the expansion function (ranking or filter policy network).
21
+
22
+ :param policy_config: A configuration object settings for the expansion policy
23
+ :type policy_config: PolicyConfig
24
+ :param compile: if True, the loaded policy network is compiled with torch_geometric.compile for faster inference (note: the argument name shadows the built-in compile)
25
+ :type compile: bool
26
+ """
27
+
28
+ self.config = policy_config
29
+
30
+ policy_net = PolicyNetwork.load_from_checkpoint(
31
+ self.config.weights_path,
32
+ map_location=torch.device("cpu"),
33
+ batch_size=1,
34
+ dropout=0
35
+ )
36
+
37
+ policy_net = policy_net.eval()
38
+ if compile:
39
+ self.policy_net = torch_geometric.compile(policy_net, dynamic=True)
40
+ else:
41
+ self.policy_net = policy_net
42
+
43
+ def predict_reaction_rules(self, retron: Retron, reaction_rules: list):  # yields (probability, reaction_rule, rule_id) tuples
44
+ """
45
+ The policy function predicts the list of reaction rules given a retron.
46
+
47
+ :param retron: The current retron for which the reaction rules are predicted
48
+ :type retron: Retron
49
+ :param reaction_rules: The list of reaction rules from which applicable reaction rules are predicted and selected.
50
+ :type reaction_rules: list
51
+ """
52
+
53
+ pyg_graph = mol_to_pyg(retron.molecule, canonicalize=False)
54
+ if pyg_graph:
55
+ with torch.no_grad():
56
+ if self.policy_net.policy_type == "filtering":
57
+ probs, priority = self.policy_net.forward(pyg_graph)
58
+ if self.policy_net.policy_type == "ranking":
59
+ probs = self.policy_net.forward(pyg_graph)
60
+ del pyg_graph
61
+ else:
62
+ return []
63
+
64
+ probs = probs[0].double()
65
+ if self.policy_net.policy_type == "filtering":
66
+ priority = priority[0].double()
67
+ priority_coef = self.config.priority_rules_fraction
68
+ probs = (1 - priority_coef) * probs + priority_coef * priority
69
+
70
+ sorted_probs, sorted_rules = torch.sort(probs, descending=True)
71
+ sorted_probs, sorted_rules = (
72
+ sorted_probs[: self.config.top_rules],
73
+ sorted_rules[: self.config.top_rules],
74
+ )
75
+
76
+ if self.policy_net.policy_type == "filtering":
77
+ sorted_probs = torch.softmax(sorted_probs, -1)
78
+
79
+ sorted_probs, sorted_rules = sorted_probs.tolist(), sorted_rules.tolist()
80
+
81
+ for prob, rule_id in zip(sorted_probs, sorted_rules):
82
+ if prob > self.config.rule_prob_threshold: # TODO it will destroy all search if it is not correct (>0.5)
83
+ yield prob, reaction_rules[rule_id], rule_id
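+ # This generator is consumed during node expansion and rollout in SynTool.mcts.tree.Tree, e.g.:
+ #   for prob, rule, rule_id in policy_function.predict_reaction_rules(retron, reaction_rules):
+ #       for products in apply_reaction_rule(retron.molecule, rule):
+ #           ...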
SynTool/mcts/node.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ Module containing a class Node that represents a node in the search tree
3
+ """
4
+
5
+
6
+ class Node:
7
+ """
8
+ Node class represents a node in the search tree
9
+ """
10
+
11
+ def __init__(self, retrons_to_expand: tuple = None, new_retrons: tuple = None) -> None:
12
+ """
13
+ The function initializes the new Node object.
14
+
15
+ :param retrons_to_expand: The tuple of retrons to be expanded. The first retron in the tuple is the current
16
+ retron which will be expanded (for which new retrons will be generated by applying the predicted reaction
17
+ rules). When the first retron has been successfully expanded, the second retron becomes the current retron
18
+ to be expanded.
19
+ :param new_retrons: The tuple of new retrons generated by applying the reaction rule. New retrons have already
20
+ been added to the retrons_to_expand (see Tree._expand_node). Here they are stored for information.
21
+ """
22
+
23
+ self.retrons_to_expand = retrons_to_expand
24
+ self.new_retrons = new_retrons
25
+
26
+ if len(self.retrons_to_expand) == 0:
27
+ self.curr_retron = tuple()
28
+ else:
29
+ self.curr_retron = self.retrons_to_expand[0]
30
+ self.next_retrons = self.retrons_to_expand[1:]
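+ # For example, the root node of the search tree is created in SynTool.mcts.tree.Tree.__init__ as:
+ #   Node(retrons_to_expand=(target_retron,), new_retrons=(target_retron,))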
31
+
32
+ def __len__(self) -> int:
33
+ """
34
+ The number of retrons in this node to expand.
35
+ """
36
+ return len(self.retrons_to_expand)
37
+
38
+ def __repr__(self) -> str:
39
+ """
40
+ String representation of the node. Returns the smiles of retrons_to_expand and new_retrons.
41
+ """
42
+ return f"retrons_to_expand: {self.retrons_to_expand}\nnew_retrons: {self.new_retrons}"
43
+
44
+ def is_solved(self) -> bool:
45
+ """
46
+ Returns True if this is a terminal (solved) node, i.e. there are no retrons left to expand.
47
+ """
48
+
49
+ return len(self.retrons_to_expand) == 0
SynTool/mcts/search.py ADDED
@@ -0,0 +1,135 @@
1
+ """
2
+ Module containing functions for running tree search for the set of target molecules
3
+ """
4
+
5
+ import csv
6
+ import json
7
+ from pathlib import Path
8
+
9
+ from tqdm import tqdm
10
+
11
+ from SynTool.interfaces.visualisation import to_table, extract_routes
12
+ from SynTool.mcts.tree import Tree, TreeConfig
13
+ from SynTool.mcts.evaluation import ValueFunction
14
+ from SynTool.mcts.expansion import PolicyFunction
15
+ from SynTool.utils import path_type
16
+ from SynTool.utils.files import MoleculeReader
17
+ from SynTool.utils.config import PolicyNetworkConfig
18
+
19
+
20
+ def extract_tree_stats(tree, target):
21
+ """
22
+ Collects various statistics from a tree and returns them in a dictionary format
23
+
24
+ :param tree: The retro tree.
25
+ :param target: The target molecule or compound that you want to search for in the tree. It is
26
+ expected to be a string representing the SMILES notation of the target molecule
27
+ :return: A dictionary with the calculated statistics
28
+ """
29
+ newick_tree, newick_meta = tree.newickify(visits_threshold=0)
30
+ newick_meta_line = ";".join([f"{nid},{v[0]},{v[1]},{v[2]}" for nid, v in newick_meta.items()])
31
+ return {
32
+ "target_smiles": str(target),
33
+ "tree_size": len(tree),
34
+ "search_time": round(tree.curr_time, 1),
35
+ "found_paths": len(tree.winning_nodes),
36
+ "newick_tree": newick_tree,
37
+ "newick_meta": newick_meta_line,
38
+ }
39
+
40
+
41
+ def tree_search(
42
+ targets_path: path_type,
43
+ tree_config: TreeConfig,
44
+ policy_config: PolicyNetworkConfig,
45
+ reaction_rules_path: path_type,
46
+ building_blocks_path: path_type,
47
+ policy_weights_path: path_type = None, # TODO not used
48
+ value_weights_path: path_type = None,
49
+ results_root: path_type = "search_results"
50
+ ):
51
+ """
52
+ Performs a tree search on a set of target molecules using specified configuration and rules,
53
+ logging the results and statistics.
54
+
55
+ :param tree_config: The config object containing the configuration for the tree search.
56
+ :param policy_config: The config object containing the configuration for the policy.
57
+ :param reaction_rules_path: The path to the file containing reaction rules.
58
+ :param building_blocks_path: The path to the file containing building blocks.
59
+ :param targets_path: The path to the file containing the target molecules (in SDF or SMILES format).
60
+ :param value_weights_path: The path to the file containing value weights (optional).
61
+ :param results_root: The path to the directory where the results of the tree search will be saved. Defaults to 'search_results/'.
62
+ :param retropaths_files_name: The base name for the files that will be generated to store the retro paths. Defaults to 'retropath'. #TODO arg dont exist
63
+
64
+ This function configures and executes a tree search algorithm, leveraging reaction rules and building blocks
65
+ to find synthetic pathways for given target molecules. The results, including paths and statistics, are
66
+ saved in the specified directory. Logging is used to record the process and any issues encountered.
67
+ """
68
+
69
+ targets_file = Path(targets_path)
70
+
71
+ # results folder
72
+ results_root = Path(results_root)
73
+ if not results_root.exists():
74
+ results_root.mkdir()
75
+
76
+ # output files
77
+ stats_file = results_root.joinpath("tree_search_stats.csv")
78
+ paths_file = results_root.joinpath("extracted_paths.json")
79
+ retropaths_folder = results_root.joinpath("retropaths")
80
+ retropaths_folder.mkdir(exist_ok=True)
81
+
82
+ # stats header
83
+ stats_header = ["target_smiles", "tree_size", "search_time",
84
+ "found_paths", "newick_tree", "newick_meta"]
85
+
86
+ # config
87
+ policy_function = PolicyFunction(policy_config=policy_config)
88
+ if tree_config.evaluation_type == 'gcn':
89
+ value_function = ValueFunction(weights_path=value_weights_path)
90
+ else:
91
+ value_function = None
92
+
93
+ # run search
94
+ n_solved = 0
95
+ extracted_paths = []
96
+ with MoleculeReader(targets_file) as targets_path, open(stats_file, "w", newline="\n") as csvfile:
97
+ statswriter = csv.DictWriter(csvfile, delimiter=",", fieldnames=stats_header)
98
+ statswriter.writeheader()
99
+
100
+ for ti, target in tqdm(enumerate(targets_path), total=len(targets_path)):
101
+
102
+ try:
103
+ # run search
104
+ tree = Tree(
105
+ target=target,
106
+ tree_config=tree_config,
107
+ reaction_rules_path=reaction_rules_path,
108
+ building_blocks_path=building_blocks_path,
109
+ policy_function=policy_function,
110
+ value_function=value_function,
111
+ )
112
+ _ = list(tree)
113
+
114
+ except:
115
+ continue
116
+
117
+ n_solved += bool(tree.winning_nodes)
118
+
119
+ # extract routes
120
+ extracted_paths.append(extract_routes(tree))
121
+
122
+ # retropaths
123
+ retropaths_file = retropaths_folder.joinpath(f"retropaths_target_{ti}.html")
124
+ to_table(tree, retropaths_file, extended=True)
125
+
126
+ # stats
127
+ statswriter.writerow(extract_tree_stats(tree, target))
128
+ csvfile.flush()
129
+
130
+ #
131
+ with open(paths_file, 'w') as f:
132
+ json.dump(extracted_paths, f)
133
+
134
+ print(f"Solved number of target molecules: {n_solved}")
135
+
SynTool/mcts/tree.py ADDED
@@ -0,0 +1,659 @@
1
+ """
2
+ Module containing a class Tree that is used for the tree search of retrosynthetic paths
3
+ """
4
+
5
+ import logging
6
+ from collections import deque, defaultdict
7
+ from math import sqrt
8
+ from random import choice, uniform
9
+ from time import time
10
+ from typing import Dict, Set, List, Tuple
11
+
12
+ from CGRtools.containers import MoleculeContainer
13
+ from CGRtools import smiles
14
+ from numpy.random import uniform
15
+ from tqdm.auto import tqdm
16
+ from SynTool.utils.loading import load_building_blocks, load_reaction_rules
17
+ from SynTool.chem.reaction import Reaction, apply_reaction_rule
18
+ from SynTool.chem.retron import Retron
19
+ from SynTool.mcts.evaluation import ValueFunction
20
+ from SynTool.mcts.expansion import PolicyFunction
21
+ from SynTool.mcts.node import Node
22
+ from SynTool.utils.config import TreeConfig
23
+
24
+
25
+ class Tree:
26
+ """
27
+ Tree class with attributes and methods for Monte-Carlo tree search
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ target: MoleculeContainer,
33
+ tree_config: TreeConfig,
34
+ reaction_rules_path: str,
35
+ building_blocks_path: str,
36
+ policy_function: PolicyFunction,
37
+ value_function: ValueFunction = None,
38
+ ):
39
+ """
40
+ The function initializes a tree object with optional parameters for tree search for target molecule.
41
+
42
+ :param target: a target molecule for retrosynthesis paths search
43
+ :type target: MoleculeContainer
44
+ :param tree_config: a tree configuration file for retrosynthesis paths search
45
+ :type tree_config: TreeConfig
46
+ :param reaction_rules_path: a path for reaction rules file
47
+ :type reaction_rules_path: str
48
+ :param building_blocks_path: a path for building blocks file
49
+ :type building_blocks_path: str
50
+ :param policy_function: a policy function object
51
+ :type policy_function: PolicyFunction
52
+ :param value_function: a value function object
53
+ :type value_function: ValueFunction
54
+ """
55
+
56
+ # config parameters
57
+ self.config = tree_config
58
+
59
+ # check target
60
+ if isinstance(target, str):
61
+ target = smiles(target)
62
+ assert bool(target), "Target is not defined, is not a MoleculeContainer or has no atoms"
63
+ if target:
64
+ target.canonicalize()
65
+
66
+ target_retron = Retron(target, canonicalize=True)
67
+ target_retron.prev_retrons.append(Retron(target, canonicalize=True))
68
+ target_node = Node(retrons_to_expand=(target_retron,), new_retrons=(target_retron,))
69
+
70
+ # tree structure init
71
+ self.nodes: Dict[int, Node] = {1: target_node}
72
+ self.parents: Dict[int, int] = {1: 0}
73
+ self.children: Dict[int, Set[int]] = {1: set()}
74
+ self.winning_nodes: List[int] = list()
75
+ self.visited_nodes: Set[int] = set()
76
+ self.expanded_nodes: Set[int] = set()
77
+ self.nodes_visit: Dict[int, int] = {1: 0}
78
+ self.nodes_depth: Dict[int, int] = {1: 0}
79
+ self.nodes_prob: Dict[int, float] = {1: 0.0}
80
+ self.nodes_init_value: Dict[int, float] = {1: 0.0}
81
+ self.nodes_total_value: Dict[int, float] = {1: 0.0}
82
+
83
+ # tree building limits
84
+ self.curr_iteration: int = 0
85
+ self.curr_tree_size: int = 2
86
+ self.curr_time: float = 2
87
+
88
+ # utils
89
+ self._tqdm = None
90
+
91
+ # policy and value functions
92
+ self.policy_function = policy_function
93
+ if self.config.evaluation_type == "gcn":
94
+ if value_function is None:
95
+ raise ValueError(
96
+ "Value function not specified while evaluation mode is 'gcn'"
97
+ )
98
+ else:
99
+ self.value_function = value_function
100
+
101
+ # building blocks and reaction reaction_rules
102
+ self.reaction_rules = load_reaction_rules(reaction_rules_path)
103
+ self.building_blocks = load_building_blocks(building_blocks_path)
104
+
105
+ def __len__(self) -> int:
106
+ """
107
+ Returns the current size (number of nodes) of a Tree.
108
+ """
109
+
110
+ return self.curr_tree_size - 1
111
+
112
+ def __iter__(self) -> "Tree": # TODO what is annotation "Tree" -> Tree ?
113
+ """
114
+ Returns the iterator for the Tree object and initializes the timer and the progress bar display.
115
+ """
116
+
117
+ if not self._tqdm:
118
+ self._start_time = time()
119
+ self._tqdm = tqdm(
120
+ total=self.config.max_iterations, disable=self.config.silent
121
+ )
122
+ return self
123
+
124
+ def __repr__(self) -> str:
125
+ """
126
+ Returns a string representation of a Tree object (target smiles, tree size, and the number of found paths).
127
+ """
128
+ return self.report()
129
+
130
+ def __next__(self) -> Tuple[bool, List[int]]:
131
+ """
132
+ The __next__ function is used to do one iteration of the tree building.
133
+ """
134
+
135
+ if self.nodes[1].curr_retron.is_building_block(self.building_blocks, self.config.min_mol_size):
136
+ raise StopIteration("Target is building block \n")
137
+
138
+ if self.curr_iteration >= self.config.max_iterations:
139
+ self._tqdm.close()
140
+ raise StopIteration("Iterations limit exceeded. \n")
141
+ elif self.curr_tree_size >= self.config.max_tree_size:
142
+ self._tqdm.close()
143
+ raise StopIteration("Max tree size exceeded or all possible paths found")
144
+ elif self.curr_time >= self.config.max_time:
145
+ self._tqdm.close()
146
+ raise StopIteration("Time limit exceeded. \n")
147
+
148
+ # start new iteration
149
+ self.curr_iteration += 1
150
+ self.curr_time = time() - self._start_time
151
+ self._tqdm.update()
152
+
153
+ curr_depth, node_id = 0, 1 # start from the root node_id
154
+
155
+ explore_path = True
156
+ while explore_path:
157
+ self.visited_nodes.add(node_id)
158
+
159
+ if self.nodes_visit[node_id]: # already visited
160
+ if not self.children[node_id]: # dead node
161
+ logging.debug(
162
+ f"Tree search: bumped into node {node_id} which is dead"
163
+ )
164
+ self._update_visits(node_id)
165
+ explore_path = False
166
+ else:
167
+ node_id = self._select_node(node_id) # select the child node
168
+ curr_depth += 1
169
+ else:
170
+ if self.nodes[node_id].is_solved(): # found path!
171
+ self._update_visits(node_id) # this prevents expanding of bb node_id
172
+ self.winning_nodes.append(node_id)
173
+ return True, [node_id]
174
+
175
+ elif (
176
+ curr_depth < self.config.max_depth
177
+ ): # expand node if depth limit is not reached
178
+ self._expand_node(node_id)
179
+ if not self.children[node_id]: # node was not expanded
180
+ logging.debug(f"Tree search: node {node_id} was not expanded")
181
+ value_to_backprop = -1.0
182
+ else:
183
+ self.expanded_nodes.add(node_id)
184
+
185
+ if self.config.search_strategy == "evaluation_first":
186
+ # recalculate node value based on children synthesisability and backpropagation
187
+ child_values = [
188
+ self.nodes_init_value[child_id]
189
+ for child_id in self.children[node_id]
190
+ ]
191
+
192
+ if self.config.evaluation_agg == "max":
193
+ value_to_backprop = max(child_values)
194
+
195
+ elif self.config.evaluation_agg == "average":
196
+ value_to_backprop = sum(child_values) / len(
197
+ self.children[node_id]
198
+ )
199
+
200
+ else:
201
+ raise ValueError(
202
+ f"Invalid evaluation aggregation mode: {self.config.evaluation_agg} "
203
+ f"Allowed values are 'max', 'average'"
204
+ )
205
+ elif self.config.search_strategy == "expansion_first":
206
+ value_to_backprop = self._get_node_value(node_id)
207
+
208
+ else:
209
+ raise ValueError(
210
+ f"Invalid search_strategy: {self.config.search_strategy}: "
211
+ f"Allowed values are 'expansion_first', 'evaluation_first'"
212
+ )
213
+
214
+ # backpropagation
215
+ self._backpropagate(node_id, value_to_backprop)
216
+ self._update_visits(node_id)
217
+ explore_path = False
218
+
219
+ if self.children[node_id]:
220
+ # found after expansion
221
+ found_after_expansion = set()
222
+ for child_id in iter(self.children[node_id]):
223
+ if self.nodes[child_id].is_solved():
224
+ found_after_expansion.add(child_id)
225
+ self.winning_nodes.append(child_id)
226
+
227
+ if found_after_expansion:
228
+ return True, list(found_after_expansion)
229
+
230
+ else:
231
+ self._backpropagate(node_id, self.nodes_total_value[node_id])
232
+ self._update_visits(node_id)
233
+ explore_path = False
234
+
235
+ return False, [node_id]
236
+
237
+ def _ucb(self, node_id: int) -> float:
238
+ """
239
+ The function calculates the Upper Confidence Bound (UCB) for a given node.
240
+
241
+ :param node_id: The `node_id` parameter is an integer that represents the ID of a node in a tree
242
+ :type node_id: int
243
+ """
244
+
245
+ prob = self.nodes_prob[node_id] # Predicted by policy network score
246
+ visit = self.nodes_visit[node_id]
247
+
248
+ if self.config.ucb_type == "puct":
249
+ u = (
250
+ self.config.c_ucb * prob * sqrt(self.nodes_visit[self.parents[node_id]])
251
+ ) / (visit + 1)
252
+ return self.nodes_total_value[node_id] + u
253
+ elif self.config.ucb_type == "uct":
254
+ u = (
255
+ self.config.c_ucb
256
+ * sqrt(self.nodes_visit[self.parents[node_id]])
257
+ / (visit + 1)
258
+ )
259
+ return self.nodes_total_value[node_id] + u
260
+ elif self.config.ucb_type == "value":
261
+ return self.nodes_init_value[node_id] / (visit + 1)
262
+ else:
263
+ raise ValueError(f"I don't know this UCB type: {self.config.ucb_type}")
264
+
265
+ def _select_node(self, node_id: int) -> int:
266
+ """
267
+ This function selects a node based on its UCB value and returns the ID of the node with the highest value of
268
+ the UCB function.
269
+
270
+ :param node_id: The `node_id` parameter is an integer that represents the ID of a node
271
+ :type node_id: int
272
+ """
273
+
274
+ if self.config.epsilon > 0:
275
+ n = uniform(0, 1)
276
+ if n < self.config.epsilon:
277
+ return choice(list(self.children[node_id]))
278
+
279
+ best_score, best_children = None, []
280
+ for child_id in self.children[node_id]:
281
+ score = self._ucb(child_id)
282
+ if best_score is None or score > best_score:
283
+ best_score, best_children = score, [child_id]
284
+ elif score == best_score:
285
+ best_children.append(child_id)
286
+ return choice(best_children)
287
+
288
+ def _expand_node(self, node_id: int) -> None:
289
+ """
290
+ The function expands a given node by generating new retrons with policy (expansion) policy.
291
+
292
+ :param node_id: The `node_id` parameter is an integer that represents the ID of the current node
293
+ :type node_id: int
294
+ """
295
+ curr_node = self.nodes[node_id]
296
+ prev_retrons = curr_node.curr_retron.prev_retrons
297
+
298
+ tmp_retrons = set()
299
+ for prob, rule, rule_id in self.policy_function.predict_reaction_rules(
300
+ curr_node.curr_retron, self.reaction_rules
301
+ ):
302
+ for products in apply_reaction_rule(curr_node.curr_retron.molecule, rule):
303
+ # check repeated products
304
+ if not products or not set(products) - tmp_retrons:
305
+ continue
306
+ tmp_retrons.update(products)
307
+
308
+ for molecule in products:
309
+ molecule.meta["reactor_id"] = rule_id
310
+
311
+ new_retrons = tuple(Retron(mol) for mol in products)
312
+ scaled_prob = prob * len(
313
+ list(filter(lambda x: len(x) > self.config.min_mol_size, products))
314
+ )
315
+
316
+ if set(prev_retrons).isdisjoint(new_retrons):
317
+ retrons_to_expand = (
318
+ *curr_node.next_retrons,
319
+ *(
320
+ x
321
+ for x in new_retrons
322
+ if not x.is_building_block(
323
+ self.building_blocks, self.config.min_mol_size
324
+ )
325
+ ),
326
+ )
327
+
328
+ child_node = Node(
329
+ retrons_to_expand=retrons_to_expand, new_retrons=new_retrons
330
+ )
331
+
332
+ for new_retron in new_retrons:
333
+ new_retron.prev_retrons = [new_retron, *prev_retrons]
334
+
335
+ self._add_node(node_id, child_node, scaled_prob)
336
+
337
+ def _add_node(self, node_id: int, new_node: Node, policy_prob: float = None) -> None:
338
+ """
339
+ This function adds a new node to a tree with its predicted policy probability.
340
+
341
+ :param node_id: ID of the parent node
342
+ :type node_id: int
343
+ :param new_node: The `new_node` is an instance of the`Node` class
344
+ :type new_node: Node
345
+ :param policy_prob: The `policy_prob` a float value that represents the probability associated with a new node.
346
+ :type policy_prob: float
347
+ """
348
+
349
+ new_node_id = self.curr_tree_size
350
+
351
+ self.nodes[new_node_id] = new_node
352
+ self.parents[new_node_id] = node_id
353
+ self.children[node_id].add(new_node_id)
354
+ self.children[new_node_id] = set()
355
+ self.nodes_visit[new_node_id] = 0
356
+ self.nodes_prob[new_node_id] = policy_prob
357
+ self.nodes_depth[new_node_id] = self.nodes_depth[node_id] + 1
358
+ self.curr_tree_size += 1
359
+
360
+ if self.config.search_strategy == "evaluation_first":
361
+ node_value = self._get_node_value(new_node_id)
362
+ elif self.config.search_strategy == "expansion_first":
363
+ node_value = self.config.init_node_value
364
+ else:
365
+ raise ValueError(
366
+ f"Invalid search_strategy: {self.config.search_strategy}: "
367
+ f"Allowed values are 'expansion_first', 'evaluation_first'"
368
+ )
369
+
370
+ self.nodes_init_value[new_node_id] = node_value
371
+ self.nodes_total_value[new_node_id] = node_value
372
+
373
+ def _get_node_value(self, node_id: int) -> float:
374
+ """
375
+ This function calculates the value for the given node.
376
+
377
+ :param node_id: ID of the given node
378
+ :type node_id: int
379
+ """
380
+
381
+ node = self.nodes[node_id]
382
+
383
+ if self.config.evaluation_type == "random":
384
+ node_value = uniform()
385
+
386
+ elif self.config.evaluation_type == "rollout":
387
+ node_value = min(
388
+ (
389
+ self._rollout_node(retron, current_depth=self.nodes_depth[node_id])
390
+ for retron in node.retrons_to_expand
391
+ ),
392
+ default=1.0,
393
+ )
394
+
395
+ elif self.config.evaluation_type == "gcn":
396
+ node_value = self.value_function.predict_value(node.new_retrons)
397
+
398
+ else:
399
+ raise ValueError(
400
+ f"I don't know this evaluation mode: {self.config.evaluation_type}"
401
+ )
402
+
403
+ return node_value
404
+
405
+ def _update_visits(self, node_id: int) -> None:
406
+ """
407
+ The function updates the number of visits from a given node to a root node.
408
+
409
+ :param node_id: The ID of a current node
410
+ :type node_id: int
411
+ """
412
+
413
+ while node_id:
414
+ self.nodes_visit[node_id] += 1
415
+ node_id = self.parents[node_id]
416
+
417
+ def _backpropagate(self, node_id: int, value: float) -> None:
418
+ """
419
+ The function backpropagates a value from the given node up to the root node.
420
+
421
+ :param node_id: The ID of a given node from which to backpropagate value
422
+ :type node_id: int
423
+ :param value: The value to backpropagate
424
+ :type value: float
425
+ """
426
+ while node_id:
427
+ if self.config.backprop_type == "muzero":
428
+ self.nodes_total_value[node_id] = (
429
+ self.nodes_total_value[node_id] * self.nodes_visit[node_id] + value
430
+ ) / (self.nodes_visit[node_id] + 1)
431
+ elif self.config.backprop_type == "cumulative":
432
+ self.nodes_total_value[node_id] += value
433
+ else:
434
+ raise ValueError(
435
+ f"I don't know this backpropagation type: {self.config.backprop_type}"
436
+ )
437
+ node_id = self.parents[node_id]
438
+
439
+ def _rollout_node(self, retron: Retron, current_depth: int = None) -> float:
+ """
+ Performs a rollout simulation starting from the given retron: at each step the first
+ successful reaction rule is applied and the resulting retrons are expanded further
+ until they are all reduced to building blocks or the simulation fails.
+
+ Returns 1.0 if the retron itself is a building block or if all generated retrons
+ are reduced to building blocks;
+
+ Returns -0.5 if the maximum rollout depth is reached before all retrons are reduced
+ to building blocks;
+
+ Returns -1.0 if no reaction rule can be applied to a retron or if the rollout runs
+ into a loop (a previously encountered retron is generated again).
+
+ :param retron: The Retron object from which the rollout starts
+ :type retron: Retron
+ :param current_depth: The depth of the evaluated node in the search tree
+ :type current_depth: int
+ """
+
+ max_depth = self.config.max_depth - current_depth
+
+ # retron checking
+ if retron.is_building_block(self.building_blocks, self.config.min_mol_size):
+ return 1.0
+
+ if max_depth == 0:
+ logging.debug("Rollout: tried to perform rollout on the leaf node")
+ return -0.5
+
+ # retron simulation
+ occurred_retrons = set()
+ retrons_to_expand = deque([retron])
+ history = defaultdict(dict)
+ rollout_depth = 0
+ while retrons_to_expand:
+ # Iterate through the predicted reaction rules and pick the first successful one.
+ # Check whether the reaction products can be found in the building blocks data;
+ # if not, add the missing products to retrons_to_expand and try to decompose them further.
+ if len(history) >= max_depth:
+ logging.debug(
+ f"Rollout: max depth of rollout is reached with these "
+ f"retrons to expand: {retrons_to_expand} {history}",
+ )
+ reward = -0.5
+ return reward
+
+ current_retron = retrons_to_expand.popleft()
+ history[rollout_depth]["target"] = current_retron
+ occurred_retrons.add(current_retron)
+
+ # Pick the first successful reaction while iterating through the predicted reaction rules
+ reaction_rule_applied = False
+ for prob, rule, rule_id in self.policy_function.predict_reaction_rules(
+ current_retron, self.reaction_rules
+ ):
+ for products in apply_reaction_rule(current_retron.molecule, rule):
+ if products:
+ reaction_rule_applied = True
+ break
+
+ if reaction_rule_applied:
+ history[rollout_depth]["rule_index"] = rule_id
+ break
+
+ if not reaction_rule_applied:
+ logging.debug(
+ f"Rollout: no reaction rule was applied for the "
+ f"molecule {current_retron} on rollout depth {rollout_depth}"
+ )
+ reward = -1.0
+ return reward
+
+ # `products` still holds the first non-empty product set yielded by apply_reaction_rule above
+ products = tuple(Retron(product) for product in products)
+ history[rollout_depth]["products"] = products
+
+ # check loops
+ if any(x in occurred_retrons for x in products) and products:
+ # Applying manually curated reaction rules can sometimes create a loop
+ logging.debug("Rollout: rollout got in the loop: %s", history)
+ reward = -1.0
+ return reward
+
+ if occurred_retrons.isdisjoint(products):
+ # Keep only the products that are not building blocks
+ # (the min_mol_size check is handled inside is_building_block)
+ retrons_to_expand.extend(
+ [
+ x
+ for x in products
+ if not x.is_building_block(
+ self.building_blocks, self.config.min_mol_size
+ )
+ ]
+ )
+ rollout_depth += 1
+
+ reward = 1.0
+ return reward
+
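When the 'rollout' evaluation type is used, the per-retron rewards returned by _rollout_node are aggregated in _get_node_value by taking the minimum over all retrons to expand, so a node is only as good as its worst retron. A toy illustration (the retron names and numbers are made up):

    # Toy illustration of how rollout rewards combine into a node value.
    rollout_rewards = {"retron_A": 1.0, "retron_B": -0.5}    # hypothetical per-retron results
    node_value = min(rollout_rewards.values(), default=1.0)  # -0.5, mirroring _get_node_value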
+ def report(self) -> str:
+ """
+ Returns a short text summary of the tree: the target molecule, the number of nodes,
+ the number of visited nodes, the number of found paths and the elapsed search time.
+ """
+
+ return (
+ f"Tree for: {str(self.nodes[1].retrons_to_expand[0])}\n"
+ f"Number of nodes: {len(self)}\nNumber of visited nodes: {len(self.visited_nodes)}\n"
+ f"Found paths: {len(self.winning_nodes)}\nTime: {round(self.curr_time, 1)} seconds"
+ )
+
+ def path_score(self, node_id: int) -> float:
+ """
+ Calculates the score of the path from the node with the given node_id to the root node:
+ the sum of the total values of the nodes on the path divided by the squared path length,
+ so that shorter paths with high node values score higher.
+
+ :param node_id: The ID of a given node
+ :type node_id: int
+ """
+
+ cumulated_nodes_value, path_length = 0, 0
+ while node_id:
+ path_length += 1
+
+ cumulated_nodes_value += self.nodes_total_value[node_id]
+ node_id = self.parents[node_id]
+
+ return cumulated_nodes_value / (path_length ** 2)
+
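A worked toy example of the score above; the values are made up and only illustrate why dividing by the squared path length favours short routes:

    # Path of three nodes with (made-up) total values along the root-to-node path.
    values = [1.0, 0.5, 0.5]
    score = sum(values) / len(values) ** 2   # 2.0 / 9, about 0.22
    # A two-node path with the same cumulative value would score 2.0 / 4 = 0.5,
    # so shorter synthesis routes are preferred.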
+ def path_to_node(self, node_id: int) -> list:
+ """
+ Returns the path (a list of Node objects) from the root node to the node specified by node_id.
+
+ :param node_id: The ID of a given node
+ :type node_id: int
+ """
+
+ nodes = []
+ while node_id:
+ nodes.append(node_id)
+ node_id = self.parents[node_id]
+ return [self.nodes[node_id] for node_id in reversed(nodes)]
+
+ def synthesis_path(self, node_id: int) -> Tuple[Reaction, ...]:
+ """
+ Given a node_id, returns a tuple of Reaction objects representing the synthesis path
+ between the root node and the node specified by node_id, ordered in the forward
+ (synthetic) direction, i.e. ending with the reaction that forms the target molecule.
+
+ :param node_id: The ID of a given node
+ :type node_id: int
+ """
+
+ nodes = self.path_to_node(node_id)
+
+ tmp = [
+ Reaction(
+ [x.molecule for x in after.new_retrons],
+ [before.curr_retron.molecule],
+ )
+ for before, after in zip(nodes, nodes[1:])
+ ]
+
+ for r in tmp:
+ r.clean2d()
+ return tuple(reversed(tmp))
+
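A hypothetical usage sketch; the `tree` and `node_id` names are placeholders for a built search tree and the ID of a winning node:

    for step, reaction in enumerate(tree.synthesis_path(node_id), start=1):
        # Reactions come out in synthetic order, ending with the one that forms the target.
        print(f"Step {step}: {reaction}")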
+ def newickify(self, visits_threshold: int = 0, root_node_id: int = 1) -> Tuple[str, dict]:
+ """
+ Renders the tree in the Newick format.
+ Adapted from https://stackoverflow.com/questions/50003007/how-to-convert-python-dictionary-to-newick-form-format
+
+ :param visits_threshold: The minimum number of visits a node must have to be included in the output
+ :type visits_threshold: int
+ :param root_node_id: The ID of a root node
+ :type root_node_id: int
+ :return: A tuple of the Newick string and a dict mapping node IDs to
+ (total value, rounded initial value, number of visits)
+ """
+ visited_nodes = set()
+
+ def newick_render_node(current_node_id: int) -> str:
+ """
+ Recursively generates a Newick string representation of a tree
+
+ :param current_node_id: The identifier of the current node in the tree
+ :type current_node_id: int
+ :return: A string representation of a node in the Newick format
+ """
+ assert (
+ current_node_id not in visited_nodes
+ ), "Error: the tree must not contain cycles!"
+ node_visit = self.nodes_visit[current_node_id]
+
+ visited_nodes.add(current_node_id)
+ if self.children[current_node_id]:
+ # Internal nodes
+ children = [
+ child
+ for child in list(self.children[current_node_id])
+ if self.nodes_visit[child] >= visits_threshold
+ ]
+ children_strings = [newick_render_node(child) for child in children]
+ children_strings = ",".join(children_strings)
+ if children_strings:
+ return f"({children_strings}){current_node_id}:{node_visit}"
+ else:
+ # All children are below the visits threshold
+ return f"{current_node_id}:{node_visit}"
+ else:
+ # Leaf nodes
+ return f"{current_node_id}:{node_visit}"
+
+ newick_string = newick_render_node(root_node_id) + ";"
+
+ meta = {}
+ for node_id in visited_nodes:
+ node_value = round(self.nodes_total_value[node_id], 3)
+
+ node_synthesisability = round(self.nodes_init_value[node_id])
+
+ visit_in_node = self.nodes_visit[node_id]
+ meta[node_id] = (node_value, node_synthesisability, visit_in_node)
+
+ return newick_string, meta
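A hypothetical usage sketch of newickify; the `tree` variable and the example output are illustrative. The returned string labels nodes by their IDs and uses visit counts as branch lengths, so it should be readable by standard Newick parsers (for example ete3 or Bio.Phylo, if available), while the meta dict carries the per-node statistics that do not fit into the Newick format:

    newick_string, meta = tree.newickify(visits_threshold=1)
    print(newick_string)  # e.g. "(3:4,(5:2)2:6)1:10;"  -> (children)node_id:visit_count
    print(meta[1])        # (total_value, init_value, visit_count) for the root node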
SynTool/ml/__init__.py ADDED
File without changes
SynTool/ml/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (147 Bytes). View file
 
SynTool/ml/networks/__init__.py ADDED
File without changes
SynTool/ml/networks/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (156 Bytes). View file
 
SynTool/ml/networks/__pycache__/modules.cpython-310.pyc ADDED
Binary file (8.14 kB). View file