stochasticribosome's picture
Add inference preprocessing
22dca11
raw
history blame
9.51 kB
import gradio as gr
import py3Dmol
from Bio.PDB import *
import numpy as np
from Bio.PDB import PDBParser
import pandas as pd
import torch
import os
from MDmodel import GNN_MD
import h5py
from transformMD import GNNTransformMD
import sys
import pytraj as pt
import pickle
# JavaScript functions
# The three values below are JavaScript callback sources stored as Python
# strings. None of them is referenced in the visible part of this file.

# Hover callback template. The doubled braces and the `{0}` placeholder mean
# this text has to go through str.format() before it is valid JavaScript; the
# formatted value is prepended to the atom name and serial in the label.
resid_hover = """function(atom,viewer) {{
if(!atom.label) {{
atom.label = viewer.addLabel('{0}:'+atom.atom+atom.serial,
{{position: atom, backgroundColor: 'mintcream', fontColor:'black'}});
}}
}}"""
# Hover callback (plain JavaScript, no formatting step): labels the atom with
# its `interaction` property unless a label is already attached.
hover_func = """
function(atom,viewer) {
if(!atom.label) {
atom.label = viewer.addLabel(atom.interaction,
{position: atom, backgroundColor: 'black', fontColor:'white'});
}
}"""
# Unhover callback: removes the label added on hover and clears the marker
# property so the next hover can add a fresh label.
unhover_func = """
function(atom,viewer) {
if(atom.label) {
viewer.removeLabel(atom.label);
delete atom.label;
}
}"""
# Element symbol lookup: position i in the list becomes key i of the table.
atom_mapping = dict(
    enumerate(['H', 'C', 'N', 'O', 'F', 'P', 'S', 'CL', 'BR', 'I', 'UNK'])
)

# Instantiate the network, restore the stored weights on the CPU and put the
# model into evaluation mode. All of this happens once, at import time.
model = GNN_MD(11, 64)
state_dict = torch.load("best_weights_rep0.pt", map_location=torch.device("cpu"))["model_state_dict"]
model.load_state_dict(state_dict)
model = model.to('cpu')
model.eval()
def run_leap(fileName, path):
    """Write a tleap input script into ``path`` and run ``tleap`` on it.

    The script loads ``<path>4amb.pdb`` and saves ``<path>exp.top`` and
    ``<path>exp.crd``. tleap's standard output is appended to
    ``<path>leap.out``.

    Args:
        fileName: Unused; kept so existing callers keep working.
        path: Directory prefix, including its trailing separator, that is
            prepended verbatim to every file name.

    Raises:
        FileNotFoundError: If ``path`` does not exist or the ``tleap``
            executable cannot be found.
    """
    # Function-scope import: the module header does not import subprocess.
    import subprocess

    leapText = """
source leaprc.protein.ff14SB
source leaprc.water.tip3p
exp = loadpdb PATH4amb.pdb
saveamberparm exp PATHexp.top PATHexp.crd
quit
"""
    leap_input = path + "leap.in"
    with open(leap_input, "w") as outLeap:
        outLeap.write(leapText.replace('PATH', path))
    # An argument list (no shell) keeps spaces or shell metacharacters in
    # ``path`` from being interpreted as part of a command line.
    with open(path + "leap.out", "a") as leap_log:
        subprocess.run(["tleap", "-f", leap_input], stdout=leap_log, check=False)
def convert_to_amber_format(pdbName):
    """Convert ``<pdbName>.pdb`` to Amber topology/coordinates and load them.

    Runs ``pdb4amber`` and then tleap (via ``run_leap``) inside the directory
    ``<pdbName>/``, writes the resulting structure back out as a PDB file in
    that directory, and returns the loaded trajectory.

    Args:
        pdbName: Path of the input PDB file without its ``.pdb`` extension.

    Returns:
        The pytraj trajectory iterator loaded from ``exp.crd`` / ``exp.top``.
    """
    # Function-scope import: the module header does not import subprocess.
    import subprocess

    fileName, path = pdbName+'.pdb', pdbName+'/'
    # Both external tools write into this directory, so it has to exist.
    os.makedirs(path, exist_ok=True)
    # Argument list instead of a shell string: the name derives from user
    # input and must not be interpreted by a shell.
    subprocess.run(
        [
            "pdb4amber",
            "-i", fileName,
            "-p",
            "-y",
            "-o", path+"4amb.pdb",
            "-l", path+"pdb4amber_protein.log",
        ],
        check=False,
    )
    run_leap(fileName, path)
    traj = pt.iterload(path+'exp.crd', top = path+'exp.top')
    # Use only the file's base name so a pdbName that contains directories
    # still lands inside the work directory.
    amber_pdb = os.path.join(path, os.path.basename(fileName))
    pt.write_traj(amber_pdb, traj, overwrite= True)
    print(amber_pdb+' was created. Please always use this file for inspection because the coordinates might get translated during amber file generation and thus might vary from the input pdb file.')
    return pt.iterload(path+'exp.crd', top = path+'exp.top')
def get_maps(mapPath):
    """Load the four pickled lookup tables stored under ``mapPath``.

    Args:
        mapPath: Directory holding the pickle files, with or without a
            trailing path separator.

    Returns:
        Tuple ``(residueMap, nameMap, typeMap, elementMap)`` in that order.

    Raises:
        FileNotFoundError: If one of the pickle files is missing.
    """
    def _load(file_name):
        # NOTE: unpickling can execute arbitrary code; only point mapPath at
        # trusted files.
        with open(os.path.join(mapPath, file_name), 'rb') as handle:
            return pickle.load(handle)

    residueMap = _load('atoms_residue_map_generate.pickle')
    nameMap = _load('atoms_name_map_generate.pickle')
    typeMap = _load('atoms_type_map_generate.pickle')
    elementMap = _load('map_atomType_element_numbers.pickle')
    return residueMap, nameMap, typeMap, elementMap
def get_residues_atomwise(residues):
    """Expand ``(residue, atom_count)`` pairs into one entry per atom.

    Each residue value is repeated ``atom_count`` times, keeping input order.
    """
    return [label for label, count in residues for _ in range(count)]
def get_begin_atom_index(traj):
    """Return cumulative atom offsets for the molecules in ``traj.top.mols``.

    The result starts with 0 and has one more entry than there are
    molecules; the last entry is the total atom count. The offsets and the
    per-molecule atom counts are also printed.
    """
    natoms = [molecule.n_atoms for molecule in traj.top.mols]
    molecule_begin_atom_index = [0]
    for count in natoms:
        # Each boundary is the previous boundary plus this molecule's size.
        molecule_begin_atom_index.append(molecule_begin_atom_index[-1] + count)
    print('molecule begin atom index', molecule_begin_atom_index, natoms)
    return molecule_begin_atom_index
def get_traj_info(traj, mapPath):
    """Collect per-atom arrays for ``traj`` using the maps under ``mapPath``.

    Returns, in this order: ``traj.xyz[0]`` (presumably the first frame's
    coordinates), the mapped elements, the mapped atom types, the atomic
    numbers, the mapped residue of every atom, and the molecule start
    offsets.
    """
    xyz = traj.xyz
    # nameMap is loaded alongside the others but is not needed here.
    residueMap, nameMap, typeMap, elementMap = get_maps(mapPath)

    mapped_types = [typeMap[atom.type] for atom in traj.top.atoms]
    mapped_elements = [elementMap[mapped] for mapped in mapped_types]
    numbers = [atom.atomic_number for atom in traj.top.atoms]
    mol_starts = get_begin_atom_index(traj)

    per_residue = [
        (residueMap[residue.name], residue.n_atoms)
        for residue in traj.top.residues
    ]
    per_atom_residue = get_residues_atomwise(per_residue)

    return xyz[0], mapped_elements, mapped_types, numbers, per_atom_residue, mol_starts
def write_h5_info(outName, struct, atoms_type, atoms_number, atoms_residue, atoms_element, molecules_begin_atom_index, atoms_coordinates_ref):
    """Write one structure's atom data to a fresh HDF5 file.

    Any existing file at ``outName`` is removed first. All arrays are stored
    gzip-compressed in a group named ``struct``: the integer arrays as
    ``i8`` and the reference coordinates as ``f8``.
    """
    if os.path.isfile(outName):
        os.remove(outName)

    # (dataset name, values) in the order the datasets are created.
    integer_datasets = (
        ('atoms_residue', atoms_residue),
        ('molecules_begin_atom_index', molecules_begin_atom_index),
        ('atoms_type', atoms_type),
        ('atoms_number', atoms_number),
        ('atoms_element', atoms_element),
    )

    with h5py.File(outName, 'w') as oF:
        subgroup = oF.create_group(struct)
        for dataset_name, values in integer_datasets:
            subgroup.create_dataset(dataset_name, data=values, compression="gzip", dtype='i8')
        subgroup.create_dataset('atoms_coordinates_ref', data=atoms_coordinates_ref, compression="gzip", dtype='f8')
def preprocess(pdbid: str = None, ouputfile: str = "inference_for_md.hdf5", mask: str = "!@H=", mappath: str = "/maps/"):
    """Turn ``<pdbid>.pdb`` into the HDF5 file consumed at inference time.

    The structure is converted to Amber format, restricted to the atoms
    selected by ``mask`` (the default ``!@H=`` is an Amber mask that excludes
    atoms whose name starts with H), and the extracted per-atom data is
    written to ``ouputfile`` under a group named ``pdbid``.
    """
    amber_traj = convert_to_amber_format(pdbid)
    selected = amber_traj[mask]
    coords_ref, elements, types, numbers, residues, mol_starts = get_traj_info(selected, mappath)
    write_h5_info(ouputfile, pdbid, types, numbers, residues, elements, mol_starts, coords_ref)
def get_pdb(pdb_code="", filepath=""):
    """Return a local path to a PDB file, downloading it by code if needed.

    Args:
        pdb_code: PDB identifier used when no uploaded file is available.
        filepath: Uploaded file object; its ``.name`` attribute is returned
            when present.

    Returns:
        ``filepath.name`` if ``filepath`` has that attribute; otherwise
        ``"<pdb_code>.pdb"`` after attempting a download into the current
        directory, or ``None`` when ``pdb_code`` is empty.

    Raises:
        FileNotFoundError: If a download is needed and ``wget`` is not
            installed.
    """
    # Function-scope import: the module header does not import subprocess.
    import subprocess

    try:
        return filepath.name
    except AttributeError:
        pass  # no uploaded file object; fall back to the PDB code

    if not pdb_code:
        return None

    # pdb_code is typed by the user, so it must never reach a shell: pass the
    # URL as one argument of an argument list.
    url = f"https://files.rcsb.org/view/{pdb_code}.pdb"
    subprocess.run(["wget", "-qnc", url], check=False)
    return f"{pdb_code}.pdb"
def get_offset(pdb):
    """Return the residue number of the first ``ATOM`` record in ``pdb``.

    Args:
        pdb: Full text of a PDB file.

    Returns:
        The residue sequence number as an int, or ``None`` when the text
        contains no line starting with ``ATOM``.

    Raises:
        ValueError: If the residue-number field of that line is not numeric.
    """
    for line in pdb.split("\n"):
        if not line.startswith("ATOM"):
            continue
        try:
            return int(line[22:27])
        except ValueError:
            # Column 27 is the insertion code (e.g. the "A" in "12A"); retry
            # with the residue-number columns 23-26 only.
            return int(line[22:26])
    return None
def get_pdbid_from_filename(filename: str):
    """Return ``filename`` without its final extension.

    Only the last suffix is removed, so dots in directory names or earlier in
    the file name are kept: ``"./11GS.pdb"`` gives ``"./11GS"``.
    """
    # Assuming the filename would be of the standard form 11GS.pdb
    return os.path.splitext(filename)[0]
def predict(pdb_code, pdb_file):
    """Run the adaptability model on an uploaded PDB file and build the UI outputs.

    Args:
        pdb_code: Text from the input box. Currently unused: only the
            uploaded file is processed.
        pdb_file: The uploaded structure, either an object exposing its path
            as ``.name`` or the path itself as a string.

    Returns:
        A ``(html, table)`` pair: an ``<iframe>`` string embedding a py3Dmol
        view with orange spheres on the highest-scoring atoms (at most 100),
        and a DataFrame with the columns index, element, x, y, z and
        Adaptability, one row per model output.

    Raises:
        ValueError: If no file was supplied.
    """
    if pdb_file is None:
        raise ValueError("No PDB file was provided.")
    # Resolve the path once; the upload may arrive as a file wrapper with a
    # ``.name`` attribute or as a plain path string.
    pdb_path = getattr(pdb_file, "name", pdb_file)

    # switch to misato env if not running from container
    pdbid = get_pdbid_from_filename(pdb_path)
    mdh5_file = "inference_for_md.hdf5"
    # The trailing slash matters if the maps directory is joined to file
    # names by plain string concatenation.
    mappath = "/maps/"
    mask = "!@H="
    preprocess(pdbid=pdbid, ouputfile=mdh5_file, mask=mask, mappath=mappath)

    # Read each dataset once and close the file before running the model.
    with h5py.File(mdh5_file, "r") as md_H5File:
        group = md_H5File[pdbid]
        cutoff = group["molecules_begin_atom_index"][:][-1]  # cutoff defines protein atoms
        coordinates = group["atoms_coordinates_ref"][:][:cutoff]
        elements = group["atoms_element"][:][:cutoff]

    atoms_protein = pd.DataFrame(
        {
            "x": coordinates[:, 0],
            "y": coordinates[:, 1],
            "z": coordinates[:, 2],
            "element": elements,
        }
    )

    item = {"scores": 0, "id": pdbid, "atoms_protein": atoms_protein}
    transform = GNNTransformMD()
    data_item = transform(item)
    with torch.no_grad():  # inference only, no gradients needed
        adaptability = model(data_item)
    adaptability = adaptability.detach().numpy()

    data = [
        [
            i,
            atom_mapping[elements[i] - 1],
            coordinates[i, 0],
            coordinates[i, 1],
            coordinates[i, 2],
            adaptability[i],
        ]
        for i in range(adaptability.shape[0])
    ]

    topN = 100
    topN_ind = np.argsort(adaptability)[::-1][:topN]

    with open(pdb_path, "r") as pdb_handle:
        pdb = pdb_handle.read()

    view = py3Dmol.view(width=600, height=400)
    view.setBackgroundColor('white')
    view.addModel(pdb, "pdb")
    view.setStyle({'stick': {'colorscheme': {'prop': 'resi', 'C': 'turquoise'}}})
    # Iterate over the indices that exist, so structures with fewer than
    # topN atoms do not raise IndexError.
    for idx in topN_ind:
        view.addSphere(
            {
                'center': {
                    'x': coordinates[idx, 0],
                    'y': coordinates[idx, 1],
                    'z': coordinates[idx, 2],
                },
                'radius': adaptability[idx]/1.5,
                'color': 'orange',
                'alpha': 0.75,
            }
        )
    view.zoomTo()

    # The page is embedded in a single-quoted srcdoc attribute, so it must
    # not contain single quotes itself.
    output = view._make_html().replace("'", '"')
    x = f"""<!DOCTYPE html><html> {output} </html>"""  # do not use ' in this input
    return f"""<iframe style="width: 100%; height:420px" name="result" allow="midi; geolocation; microphone; camera;
display-capture; encrypted-media;" sandbox="allow-modals allow-forms
allow-scripts allow-same-origin allow-popups
allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
allowpaymentrequest="" frameborder="0" srcdoc='{x}'></iframe>""", pd.DataFrame(data, columns=['index','element','x','y','z','Adaptability'])
# Gradio CSV logger instance, created at import time. Nothing in the visible
# part of this file references it.
callback = gr.CSVLogger()
def run():
    """Build the Gradio Blocks interface and serve it on 0.0.0.0:7860.

    The page holds a text box, a file upload, a "Run" button, and two output
    rows (an HTML viewer and a data table). Clicking the button calls
    ``predict`` with the text box and the upload as inputs.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Protein Adaptability Prediction")
        inp = gr.Textbox(placeholder="PDB Code or upload file below", label="Input structure")
        pdb_file = gr.File(label="PDB File Upload")
        # A Button's caption is its first argument (``value``); ``label`` is
        # not the caption keyword.
        single_btn = gr.Button("Run")
        with gr.Row():
            html = gr.HTML()
        with gr.Row():
            dataframe = gr.Dataframe()
        single_btn.click(fn=predict, inputs=[inp, pdb_file], outputs=[html, dataframe])
    demo.launch(server_name="0.0.0.0", server_port=7860)
# Start the web app only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run()