Spaces:

igashov
/

DiffLinker

Sleeping

File size: 16,063 Bytes

95ba5bc

import csv
import networkx as nx

from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import AllChem
from src.delinker_utils import sascorer


def read_triples_file(filename):
    """Read a space-separated .smi file and return its first three columns.

    Args:
        filename: Path to a text file with one record per line whose fields
            are separated by single spaces.

    Returns:
        A list with one entry per non-blank line; each entry is a list
        holding at most the first three space-separated tokens of the line.
    """
    triples = []
    with open(filename, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Lines read from a file keep their trailing newline, so the
            # emptiness test has to be made on the stripped text; otherwise
            # blank lines would be returned as [''].
            if stripped:
                triples.append(stripped.split(' ')[0:3])
    return triples


def remove_dummys(smi_string):
    """Return the SMILES of `smi_string` with its dummy atoms swapped out.

    Every '*' match is replaced by '[H]' via AllChem.ReplaceSubstructs, the
    first product is passed through Chem.RemoveHs, and the result is written
    back as a SMILES string.
    """
    parsed = Chem.MolFromSmiles(smi_string)
    dummy = Chem.MolFromSmiles('*')
    hydrogen = Chem.MolFromSmiles('[H]')
    products = AllChem.ReplaceSubstructs(parsed, dummy, hydrogen, True)
    without_hs = Chem.RemoveHs(products[0])
    return Chem.MolToSmiles(without_hs)


def sa_filter(results, verbose=True):
    """Return the fraction of entries whose SA score improved.

    Args:
        results: Iterable of groups. Each group is a sized sequence of
            entries where entry[0] and entry[1] are SMILES strings; an entry
            counts as improved when the SA score (index 1 of
            calc_mol_props) of entry[1] is lower than that of entry[0].
        verbose: When true, print a progress counter every 10 groups.

    Returns:
        count / total as a float, or 0.0 when `results` holds no entries
        (the original raised ZeroDivisionError in that case).
    """
    count = 0
    total = 0
    for processed, res in enumerate(results):
        total += len(res)
        for m in res:
            candidate_props = calc_mol_props(m[1])
            reference_props = calc_mol_props(m[0])
            # calc_mol_props returns None for SMILES it cannot parse; such
            # entries stay in the denominator but count as "not improved"
            # instead of raising TypeError on the subscript.
            if candidate_props is None or reference_props is None:
                continue
            # Check SA score has improved
            if candidate_props[1] < reference_props[1]:
                count += 1
        # Progress
        if verbose and processed % 10 == 0:
            print("\rProcessed %d" % processed, end="")
    if verbose:
        # Return the cursor to the line start after the progress output.
        print("\r", end="")
    if total == 0:
        return 0.0
    return count / total


def ring_check_res(res, clean_frag):
    """Return False if any linker ring contains a double bond, else True.

    `res[1]` is parsed as SMILES and `clean_frag` is deleted from it with
    Chem.DeleteSubstructs; the rings of what remains (Chem.GetSymmSSSR) are
    then scanned for a bond of type 2 whose two end atoms both lie in the
    same ring.
    """
    generated = Chem.MolFromSmiles(res[1])
    remainder = Chem.DeleteSubstructs(generated, clean_frag)

    for ring_atoms in Chem.GetSymmSSSR(remainder):
        for member_idx in ring_atoms:
            for bond in remainder.GetAtomWithIdx(member_idx).GetBonds():
                if bond.GetBondType() != 2:
                    continue
                # Only bonds lying entirely inside this ring disqualify it.
                if bond.GetBeginAtomIdx() in ring_atoms and bond.GetEndAtomIdx() in ring_atoms:
                    return False
    return True


def ring_filter(results, verbose=True):
    """Return the fraction of entries that pass the ring check.

    Args:
        results: Iterable of groups. Each group is a sized sequence of
            entries; entry[0] is the fragments SMILES (dummy atoms are
            replaced by hydrogens before use) and the whole entry is handed
            to ring_check_res together with the cleaned fragments.
        verbose: When true, print a progress counter every 10 groups.

    Returns:
        count / total as a float, or 0.0 when `results` holds no entries
        (the original raised ZeroDivisionError in that case).
    """
    count = 0
    total = 0
    # Both query molecules are loop-invariant, so build them once.
    du = Chem.MolFromSmiles('*')
    hydrogen = Chem.MolFromSmiles('[H]')
    for processed, res in enumerate(results):
        total += len(res)
        for m in res:
            # Clean frags
            replaced = AllChem.ReplaceSubstructs(Chem.MolFromSmiles(m[0]), du, hydrogen, True)
            clean_frag = Chem.RemoveHs(replaced[0])
            if ring_check_res(m, clean_frag):
                count += 1
        # Progress
        if verbose and processed % 10 == 0:
            print("\rProcessed %d" % processed, end="")
    if verbose:
        # Return the cursor to the line start after the progress output.
        print("\r", end="")
    if total == 0:
        return 0.0
    return count / total


def check_ring_filter(linker):
    """Return True when no ring of `linker` contains a double bond.

    Rings come from Chem.GetSymmSSSR; a ring fails the check as soon as one
    of its atoms carries a bond of type 2 whose begin and end atoms are both
    members of that ring.
    """
    offending_bonds = (
        bond
        for ring in Chem.GetSymmSSSR(linker)
        for atom_idx in ring
        for bond in linker.GetAtomWithIdx(atom_idx).GetBonds()
        if bond.GetBondType() == 2
        and bond.GetBeginAtomIdx() in ring
        and bond.GetEndAtomIdx() in ring
    )
    return not any(True for _ in offending_bonds)


def check_pains(mol, pains_smarts):
    """Return True when `mol` matches none of the patterns in `pains_smarts`.

    Each pattern is tested with mol.HasSubstructMatch; evaluation stops at
    the first match.
    """
    return not any(mol.HasSubstructMatch(pattern) for pattern in pains_smarts)


def calc_2d_filters(toks, pains_smarts):
    """Evaluate the 2D filters (SA score, ring check, PAINS) for one molecule.

    Args:
        toks: Sequence of three SMILES strings in the order
            (full molecule, linker, unlinked fragments).
        pains_smarts: Iterable of pattern molecules passed to check_pains.

    Returns:
        [sa_improved, ring_ok, pains_ok] when the cleaned fragments are a
        substructure of the full molecule; an empty list when they are not;
        [False, False, False] when any step raises.
    """
    try:
        # Input format: (Full Molecule (SMILES), Linker (SMILES), Unlinked Fragments (SMILES))
        frags = Chem.MolFromSmiles(toks[2])
        linker = Chem.MolFromSmiles(toks[1])
        full_mol = Chem.MolFromSmiles(toks[0])
        # Remove dummy atoms from unlinked fragments
        du = Chem.MolFromSmiles('*')
        clean_frag = Chem.RemoveHs(AllChem.ReplaceSubstructs(frags, du, Chem.MolFromSmiles('[H]'), True)[0])

        res = []
        # Check: Unlinked fragments in full molecule
        if len(full_mol.GetSubstructMatch(clean_frag)) > 0:
            # Check: SA score improved from unlinked fragments to full molecule
            res.append(bool(calc_sa_score_mol(full_mol) < calc_sa_score_mol(frags)))
            # Check: No non-aromatic rings with double bonds
            res.append(bool(check_ring_filter(linker)))
            # Check: Pass pains filters
            res.append(bool(check_pains(full_mol, pains_smarts)))
        return res
    except Exception:
        # Best-effort: any parsing or scoring failure marks the molecule as
        # failing every filter. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate out of worker processes.
        return [False, False, False]


def calc_filters_2d_dataset(results, pains_smarts_loc, n_cores=1):
    """Run calc_2d_filters over a dataset in parallel.

    Args:
        results: Iterable of records; indices 2, 4 and 1 of each record are
            forwarded to calc_2d_filters as (full molecule, linker, unlinked
            fragments).
        pains_smarts_loc: Path to a CSV file whose first column holds the
            PAINS SMARTS patterns.
        n_cores: Number of joblib worker processes.

    Returns:
        A list with one calc_2d_filters result per record, in input order.
    """
    # Load pains filters. csv.reader yields an empty list for a blank line,
    # so such rows are skipped instead of raising IndexError on row[0].
    with open(pains_smarts_loc, 'r') as f:
        pains_smarts = [Chem.MolFromSmarts(row[0], mergeHs=True) for row in csv.reader(f) if row]
    with Parallel(n_jobs=n_cores, backend='multiprocessing') as parallel:
        filters_2d = parallel(delayed(calc_2d_filters)([toks[2], toks[4], toks[1]], pains_smarts) for toks in results)

    return filters_2d


def calc_mol_props(smiles):
    """Compute QED and synthetic-accessibility score for a SMILES string.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        [qed, sas] where `qed` comes from rdkit.Chem.QED.qed and `sas` from
        sascorer.calculateScore, or None (after printing a message) when the
        SMILES cannot be parsed.
    """
    # Imported explicitly: `from rdkit import Chem` is not guaranteed to have
    # loaded the QED submodule, in which case `Chem.QED` raises AttributeError.
    from rdkit.Chem import QED

    # Create RDKit mol
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("Error passing: %s" % smiles)
        return None

    # QED
    qed = QED.qed(mol)
    # Synthetic accessibility score
    sas = sascorer.calculateScore(mol)

    return [qed, sas]


def calc_sa_score_mol(mol, verbose=False):
    """Return the synthetic accessibility score of `mol`.

    A `mol` of None yields None; with `verbose` set, a message is printed
    first. Otherwise the value of sascorer.calculateScore(mol) is returned.
    """
    if mol is not None:
        # Synthetic accessibility score
        return sascorer.calculateScore(mol)
    if verbose:
        print("Error passing: %s" % mol)
    return None


def get_linker(full_mol, clean_frag, starting_point):
    """Extract the linker that joins the starting fragments inside `full_mol`.

    Args:
        full_mol: RDKit mol object of the complete molecule.
        clean_frag: RDKit mol object of the starting fragments, used as the
            substructure query against `full_mol`.
        starting_point: SMILES of the starting fragments ('.'-separated),
            used to delete the fragments from the fragmented molecule.

    Returns:
        SMILES string of the linker. An empty string is returned when
        `clean_frag` does not match `full_mol`, when both have the same
        number of heavy atoms, or when no candidate linker is found.

    Side effects:
        Prints diagnostics to stdout on failure or ambiguity.
        NOTE(review): Chem.Kekulize is called on the `full_mol` argument
        itself, so the caller's object appears to be modified in place --
        confirm callers do not rely on its aromatic flags afterwards.
    """
    # INPUT FORMAT: molecule (RDKit mol object), clean fragments (RDKit mol object), starting fragments (SMILES)

    # Get matches of fragments
    matches = list(full_mol.GetSubstructMatches(clean_frag))

    # If no matches, terminate
    if len(matches) == 0:
        print("No matches")
        return ""

    # Get number of atoms in linker
    linker_len = full_mol.GetNumHeavyAtoms() - clean_frag.GetNumHeavyAtoms()
    if linker_len == 0:
        return ""

    # Setup
    # Copy taken before kekulization; this copy is the one fragmented later.
    mol_to_break = Chem.Mol(full_mol)
    Chem.Kekulize(full_mol, clearAromaticFlags=True)

    # Candidate linkers from matches that fail the strict checks below
    poss_linker = []

    if len(matches) > 0:
        # Loop over matches
        for match in matches:
            mol_rw = Chem.RWMol(full_mol)
            # Get linker atoms
            # NOTE(review): indexing by range(GetNumHeavyAtoms()) assumes every
            # atom index below that count is a heavy atom (no explicit H atoms) -- confirm.
            linker_atoms = list(set(list(range(full_mol.GetNumHeavyAtoms()))).difference(match))
            linker_bonds = []
            atoms_joined_to_linker = []
            # Loop over starting fragments atoms
            # Get (i) bonds between starting fragments and linker, (ii) atoms joined to linker
            for idx_to_delete in sorted(match, reverse=True):
                nei = [x.GetIdx() for x in mol_rw.GetAtomWithIdx(idx_to_delete).GetNeighbors()]
                intersect = set(nei).intersection(set(linker_atoms))
                if len(intersect) == 1:
                    linker_bonds.append(mol_rw.GetBondBetweenAtoms(idx_to_delete, list(intersect)[0]).GetIdx())
                    atoms_joined_to_linker.append(idx_to_delete)
                elif len(intersect) > 1:
                    # One fragment atom bonded to several linker atoms: record every bond
                    for idx_nei in list(intersect):
                        linker_bonds.append(mol_rw.GetBondBetweenAtoms(idx_to_delete, idx_nei).GetIdx())
                        atoms_joined_to_linker.append(idx_to_delete)

            # Check number of atoms joined to linker
            # If not == 2, check next match
            if len(set(atoms_joined_to_linker)) != 2:
                continue

            # Delete starting fragments atoms
            # (highest index first so the remaining indices stay valid)
            for idx_to_delete in sorted(match, reverse=True):
                mol_rw.RemoveAtom(idx_to_delete)

            linker = Chem.Mol(mol_rw)
            # Check linker required num atoms
            if linker.GetNumHeavyAtoms() == linker_len:
                mol_rw = Chem.RWMol(full_mol)
                # Delete linker atoms
                for idx_to_delete in sorted(linker_atoms, reverse=True):
                    mol_rw.RemoveAtom(idx_to_delete)
                frags = Chem.Mol(mol_rw)
                # Check there are two disconnected fragments
                if len(Chem.rdmolops.GetMolFrags(frags)) == 2:
                    # Fragment molecule into starting fragments and linker
                    fragmented_mol = Chem.FragmentOnBonds(mol_to_break, linker_bonds)
                    # Remove starting fragments from fragmentation
                    linker_to_return = Chem.Mol(fragmented_mol)
                    qp = Chem.AdjustQueryParameters()
                    qp.makeDummiesQueries = True
                    for f in starting_point.split('.'):
                        qfrag = Chem.AdjustQueryProperties(Chem.MolFromSmiles(f), qp)
                        linker_to_return = AllChem.DeleteSubstructs(linker_to_return, qfrag, onlyFrags=True)

                    # Check linker is connected and two bonds to outside molecule
                    if len(Chem.rdmolops.GetMolFrags(linker)) == 1 and len(linker_bonds) == 2:
                        Chem.Kekulize(linker_to_return, clearAromaticFlags=True)
                        # If for some reason a starting fragment isn't removed (and it's larger than the linker), remove (happens very occasionally)
                        if len(Chem.rdmolops.GetMolFrags(linker_to_return)) > 1:
                            for frag in Chem.MolToSmiles(linker_to_return).split('.'):
                                if Chem.MolFromSmiles(frag).GetNumHeavyAtoms() == linker_len:
                                    return frag
                        # Round-trip through SMILES before returning; also reached when
                        # no piece above has exactly linker_len heavy atoms
                        return Chem.MolToSmiles(Chem.MolFromSmiles(Chem.MolToSmiles(linker_to_return)))

                    # If not, add to possible linkers (above doesn't capture some complex cases)
                    else:
                        fragmented_mol = Chem.MolFromSmiles(Chem.MolToSmiles(fragmented_mol), sanitize=False)
                        linker_to_return = AllChem.DeleteSubstructs(fragmented_mol, Chem.MolFromSmiles(starting_point))
                        poss_linker.append(Chem.MolToSmiles(linker_to_return))

    # If only one possibility, return linker
    if len(poss_linker) == 1:
        return poss_linker[0]
    # If no possibilities, process failed
    elif len(poss_linker) == 0:
        print("FAIL:", Chem.MolToSmiles(full_mol), Chem.MolToSmiles(clean_frag), starting_point)
        return ""
    # If multiple possibilities, process probably failed
    else:
        print("More than one poss linker. ", poss_linker)
        return poss_linker[0]


def get_linker_v2(full_mol, clean_frag):
    """Extract the linker of `full_mol` by deleting the matched fragment atoms.

    Unlike get_linker, this variant takes no starting-fragment SMILES and
    returns the atoms left after removing the match directly, without
    calling Chem.FragmentOnBonds.

    Args:
        full_mol: RDKit mol object of the complete molecule.
        clean_frag: RDKit mol object of the starting fragments, used as the
            substructure query against `full_mol`.

    Returns:
        SMILES string of the linker. An empty string is returned when
        `clean_frag` does not match `full_mol`, when both have the same
        number of heavy atoms, or when no candidate linker is found.

    Side effects:
        Prints diagnostics to stdout on failure or ambiguity.
        NOTE(review): Chem.Kekulize is called on the `full_mol` argument
        itself, so the caller's object appears to be modified in place --
        confirm callers do not rely on its aromatic flags afterwards.
    """
    # INPUT FORMAT: molecule (RDKit mol object), clean fragments (RDKit mol object)

    # Get matches of fragments
    matches = list(full_mol.GetSubstructMatches(clean_frag))

    # If no matches, terminate
    if len(matches) == 0:
        print("No matches")
        return ""

    # Get number of atoms in linker
    linker_len = full_mol.GetNumHeavyAtoms() - clean_frag.GetNumHeavyAtoms()
    if linker_len == 0:
        return ""

    # Setup
    # NOTE(review): mol_to_break is assigned but never read in this function.
    mol_to_break = Chem.Mol(full_mol)
    Chem.Kekulize(full_mol, clearAromaticFlags=True)

    # Candidate linkers from matches that fail the strict check below
    poss_linker = []

    if len(matches) > 0:
        # Loop over matches
        for match in matches:
            mol_rw = Chem.RWMol(full_mol)
            # Get linker atoms
            # NOTE(review): indexing by range(GetNumHeavyAtoms()) assumes every
            # atom index below that count is a heavy atom (no explicit H atoms) -- confirm.
            linker_atoms = list(set(list(range(full_mol.GetNumHeavyAtoms()))).difference(match))
            linker_bonds = []
            atoms_joined_to_linker = []
            # Loop over starting fragments atoms
            # Get (i) bonds between starting fragments and linker, (ii) atoms joined to linker
            for idx_to_delete in sorted(match, reverse=True):
                nei = [x.GetIdx() for x in mol_rw.GetAtomWithIdx(idx_to_delete).GetNeighbors()]
                intersect = set(nei).intersection(set(linker_atoms))
                if len(intersect) == 1:
                    linker_bonds.append(mol_rw.GetBondBetweenAtoms(idx_to_delete, list(intersect)[0]).GetIdx())
                    atoms_joined_to_linker.append(idx_to_delete)
                elif len(intersect) > 1:
                    # One fragment atom bonded to several linker atoms: record every bond
                    for idx_nei in list(intersect):
                        linker_bonds.append(mol_rw.GetBondBetweenAtoms(idx_to_delete, idx_nei).GetIdx())
                        atoms_joined_to_linker.append(idx_to_delete)

            # Check number of atoms joined to linker
            # If not == 2, check next match
            if len(set(atoms_joined_to_linker)) != 2:
                continue

            # Delete starting fragments atoms
            # (highest index first so the remaining indices stay valid)
            for idx_to_delete in sorted(match, reverse=True):
                mol_rw.RemoveAtom(idx_to_delete)

            linker = Chem.Mol(mol_rw)
            # Check linker required num atoms
            if linker.GetNumHeavyAtoms() == linker_len:
                mol_rw = Chem.RWMol(full_mol)
                # Delete linker atoms
                for idx_to_delete in sorted(linker_atoms, reverse=True):
                    mol_rw.RemoveAtom(idx_to_delete)
                # NOTE(review): frags is built but never read in this function.
                frags = Chem.Mol(mol_rw)

                # Check linker is connected and two bonds to outside molecule
                if len(Chem.rdmolops.GetMolFrags(linker)) == 1 and len(linker_bonds) == 2:
                    Chem.Kekulize(linker, clearAromaticFlags=True)
                    # If for some reason a starting fragment isn't removed (and it's larger than the linker), remove (happens very occasionally)
                    # NOTE(review): the enclosing condition already requires a single
                    # fragment, so this branch looks unreachable here -- confirm.
                    if len(Chem.rdmolops.GetMolFrags(linker)) > 1:
                        for frag in Chem.MolToSmiles(linker).split('.'):
                            if Chem.MolFromSmiles(frag).GetNumHeavyAtoms() == linker_len:
                                return frag
                    # Round-trip through SMILES before returning
                    return Chem.MolToSmiles(Chem.MolFromSmiles(Chem.MolToSmiles(linker)))

                # If not, add to possible linkers (above doesn't capture some complex cases)
                else:
                    poss_linker.append(Chem.MolToSmiles(linker))

    # If only one possibility, return linker
    if len(poss_linker) == 1:
        return poss_linker[0]
    # If no possibilities, process failed
    elif len(poss_linker) == 0:
        print("FAIL:", Chem.MolToSmiles(full_mol), Chem.MolToSmiles(clean_frag))
        return ""
    # If multiple possibilities, process probably failed
    else:
        print("More than one poss linker. ", poss_linker)
        return poss_linker[0]


def unique(results):
    """Return the fraction of non-duplicate items across all groups.

    Duplicates are counted within each group separately (via `set`), so the
    items must be hashable.

    Args:
        results: Iterable of sized collections of hashable items.

    Returns:
        1 - duplicates / total as a float, or 0.0 when `results` holds no
        items at all (the original raised ZeroDivisionError in that case).
    """
    total_dupes = 0
    total = 0
    for res in results:
        original_num = len(res)
        total_dupes += original_num - len(set(res))
        total += original_num
    if total == 0:
        return 0.0
    return 1 - total_dupes / float(total)


def check_recovered_original_mol_with_idx(results):
    """Report, per group, whether the original molecule was regenerated.

    For every group the reference SMILES is read from res[0][0][0] and each
    entry's generated SMILES from entry[0][2]. Both are compared after
    stereochemistry removal, hydrogen removal and conversion back to SMILES.

    Returns:
        (outcomes, rec_idx): `outcomes` has one bool per group; `rec_idx`
        collects entry[1] of every entry that matched its group's reference.
    """
    def _flatten(smi):
        # Parse, drop stereo information in place, then write SMILES without Hs.
        mol = Chem.MolFromSmiles(smi)
        Chem.RemoveStereochemistry(mol)
        return Chem.MolToSmiles(Chem.RemoveHs(mol))

    outcomes, rec_idx = [], []
    for group in results:
        reference = _flatten(group[0][0][0])
        recovered = False
        # Every entry is inspected (no early exit) so that all matching
        # indices are recorded.
        for entry in group:
            if _flatten(entry[0][2]) == reference:
                recovered = True
                rec_idx.append(entry[1])
        outcomes.append(recovered)
    return outcomes, rec_idx


def topology_from_rdkit(rdkit_molecule):
    """Build a networkx graph mirroring the connectivity of an RDKit molecule.

    Nodes are atom indices carrying an `atom_type` attribute (the atomic
    number); edges join bonded atoms and carry a `bond_type` attribute.
    """
    graph = nx.Graph()
    # Atoms become nodes
    graph.add_nodes_from(
        (atom.GetIdx(), {'atom_type': atom.GetAtomicNum()})
        for atom in rdkit_molecule.GetAtoms()
    )
    # Bonds become edges
    graph.add_edges_from(
        (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), {'bond_type': bond.GetBondType()})
        for bond in rdkit_molecule.GetBonds()
    )
    return graph