Commit
·
0a52d93
1
Parent(s):
19e5543
Update README.md
Browse files
README.md
CHANGED
@@ -1,3 +1,77 @@
|
|
1 |
---
|
2 |
license: mit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: mit
|
3 |
+
language:
|
4 |
+
- en
|
5 |
+
library_name: transformers
|
6 |
+
tags:
|
7 |
+
- esm
|
8 |
+
- esm2
|
9 |
+
- biology
|
10 |
+
- protein
|
11 |
+
- protein language model
|
12 |
+
- cafa 5
|
13 |
+
- protein function prediction
|
14 |
---
|
15 |
+
|
16 |
+
|
17 |
+
## Using the model
|
18 |
+
First, download the file `go-basic.obo` [from here](https://huggingface.co/datasets/AmelieSchreiber/cafa_5)
|
19 |
+
and store the file locally, then provide the local path in the code below:
|
20 |
+
|
21 |
+
```python
|
22 |
+
import torch
|
23 |
+
from transformers import AutoTokenizer, EsmForSequenceClassification
|
24 |
+
from sklearn.metrics import precision_recall_fscore_support
|
25 |
+
|
26 |
+
# 1. Parsing the go-basic.obo file
|
27 |
+
def parse_obo_file(file_path):
    """Parse the ``[Term]`` stanzas of an OBO ontology file.

    Args:
        file_path: Path to the OBO file (for example ``go-basic.obo``).

    Returns:
        A list with one dict per ``[Term]`` stanza, in file order. Each dict
        holds whichever of the keys ``"id"``, ``"name"``, ``"namespace"`` and
        ``"definition"`` were present in that stanza.
    """
    # Tag prefix -> key under which the stripped value is stored.
    plain_tags = (("id:", "id"), ("name:", "name"), ("namespace:", "namespace"))

    terms = []
    term = None  # dict of the [Term] stanza being read; None outside one

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            if line.startswith("[") and line.endswith("]"):
                # Any stanza header closes the previous stanza. Only [Term]
                # stanzas are collected, so a following [Typedef] stanza can
                # no longer overwrite the fields of the last term.
                if line == "[Term]":
                    term = {}
                    terms.append(term)
                else:
                    term = None
                continue

            if term is None:
                # File header or a stanza type we do not collect.
                continue

            if line.startswith("def:"):
                value = line[len("def:"):]
                pieces = value.split('"')
                # The definition is the text after the first double quote;
                # fall back to the raw value when the line has no quotes.
                term["definition"] = pieces[1] if len(pieces) > 1 else value.strip()
                continue

            for prefix, key in plain_tags:
                if line.startswith(prefix):
                    # Slice off the tag instead of splitting on it so a value
                    # that happens to contain the tag text is kept whole.
                    term[key] = line[len(prefix):].strip()
                    break

    return terms
|
46 |
+
|
47 |
+
# Replace `go-basic.obo` with the path where you stored the file.
parsed_terms = parse_obo_file("go-basic.obo")

# 2. Load the saved model and tokenizer
# Both are fetched from the same repository id.
model_path = "AmelieSchreiber/cafa_5_protein_function_prediction"
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = EsmForSequenceClassification.from_pretrained(model_path)
|
53 |
+
|
54 |
+
# 3. The predict_protein_function function
|
55 |
+
def predict_protein_function(sequence, model, tokenizer, go_terms,
                             unique_terms=None, threshold=0.05):
    """Predict GO term names for a protein sequence.

    Args:
        sequence: Protein sequence handed to ``tokenizer``.
        model: Callable model whose output exposes ``.logits``.
        tokenizer: Callable that returns the keyword inputs for ``model``.
        go_terms: Iterable of dicts with ``"id"`` and ``"name"`` keys, such
            as the list returned by ``parse_obo_file``.
        unique_terms: Sequence mapping each output index of the model to a
            GO term id (the label list from the training script). When
            omitted, a module-level ``unique_terms`` is used if one exists.
        threshold: Minimum sigmoid score for a label to count as predicted.

    Returns:
        Names of the predicted GO terms, in output-index order. Predicted
        ids that do not occur in ``go_terms`` are skipped.

    Raises:
        NameError: If labels were predicted but no ``unique_terms`` was
            passed and none is defined at module level.
    """
    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1022)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.sigmoid(outputs.logits)
        # Element [1] of the torch.where result holds the label (column) indices.
        predicted_indices = torch.where(predictions > threshold)[1].tolist()

    if not predicted_indices:
        return []

    if unique_terms is None:
        # Backward compatibility: earlier versions read a global of this name.
        try:
            unique_terms = globals()["unique_terms"]
        except KeyError:
            raise NameError(
                "unique_terms is not defined; pass the list of GO term ids "
                "used as labels during training via the unique_terms argument"
            ) from None

    # Build the id -> name table once instead of scanning go_terms for every
    # predicted index. setdefault keeps the first entry for a duplicated id.
    name_by_id = {}
    for term in go_terms:
        term_id = term.get("id")
        if term_id is not None and "name" in term:
            name_by_id.setdefault(term_id, term["name"])

    functions = []
    for idx in predicted_indices:
        term_id = unique_terms[idx]
        if term_id in name_by_id:
            functions.append(name_by_id[term_id])

    return functions
|
72 |
+
|
73 |
+
# 4. Predicting protein function for an example sequence
|
74 |
+
# Replace with your protein sequence.
example_sequence = "MAYLGSLVQRRLELASGDRLEASLGVGSELDVRGDRVKAVGSLDLEEGRLEQAGVSMA"

# NOTE: predict_protein_function looks up a `unique_terms` list (the label
# order from the training script); it must be defined before this call.
predicted_functions = predict_protein_function(
    sequence=example_sequence,
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    go_terms=parsed_terms,
)
print(predicted_functions)
|
77 |
+
```
|