Montazerh82 committed
Commit 3535e25
1 Parent(s): d651b2a

add evaluate file

Files changed:
- .gitignore +2 -2
- evaluate.py +118 -0
- normalizer.py +2 -1
- result.csv +2 -0
.gitignore CHANGED
@@ -1,4 +1,4 @@
-.gitignore
 test.ipynb
 __pycache__/
-
+cc.en.300.bin
+cc.fa.300.bin
evaluate.py ADDED

import numpy as np
import fasttext
import fasttext.util
import pandas as pd
import random
import normalizer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM

random.seed(42)

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")
# model = AutoModelForMaskedLM.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")


# Load pre-trained word embeddings (e.g., fasttext)
fasttext.util.download_model('fa', if_exists='ignore')  # Persian
embeddings = fasttext.load_model(r'cc.fa.300.bin')

# Example sentences with masked tokens
# masked_sentences = [
#     ("The capital of France is [MASK].", "Paris"),
#     ("The [MASK] is the largest mammal.", "whale"),
#     ("The fastest land animal is the [MASK].", "cheetah")
# ]

# df = pd.read_excel('law_excel.xlsx', sheet_name='Sheet1')
# dataset = Dataset.from_pandas(df)
dataset = load_dataset('community-datasets/farsi_news', split='hamshahri')
dataset = dataset.shuffle(seed=42).select(range(100))

def tokenize_dataset(examples):
    result = tokenizer(examples['summary'])

    temp = {'masked_token': [-1] * len(result['input_ids']), 'input_ids': result['input_ids']}
    for i, example in enumerate(result['input_ids']):
        # Pick a random non-special token, remember it as the ground truth, then mask it
        rand = random.randint(1, len(example)-2)
        temp['masked_token'][i] = tokenizer.decode(example[rand])
        temp['input_ids'][i][rand] = 4  # 4 = id of the [MASK] token for this tokenizer

    result['input_ids'] = temp['input_ids']
    result['masked_token'] = temp['masked_token']

    return result

dataset = dataset.map(tokenize_dataset, batched=True)


# Initialize the fill-mask pipeline
fill_mask = pipeline("fill-mask", model="HooshvareLab/albert-fa-zwnj-base-v2")

# Define k for top-k predictions
k = 5
# Define similarity threshold
similarity_threshold = 0.5

# Initialize counters (TPP/FPP count individual predictions, TPR/FNR count sentences)
TPP = 0
FPP = 0

FNR = 0
TPR = 0

def get_embedding(word):
    try:
        return embeddings[word]
    except KeyError:
        return None

for _, data in enumerate(dataset.iter(1)):
    sentence = tokenizer.decode(data['input_ids'][0][1:-1])
    sentence = normalizer.cleaning(sentence)
    ground_truth = data['masked_token'][0]

    # Get top-k predictions
    predictions = fill_mask(sentence)[:k]
    predicted_tokens = [pred['token_str'] for pred in predictions]

    ground_truth_emb = get_embedding(ground_truth)

    if ground_truth_emb is None:
        continue  # Skip if ground truth is not in the embeddings

    flag = False
    for token in predicted_tokens:
        token_emb = get_embedding(token)
        if token_emb is not None:
            similarity = cosine_similarity([ground_truth_emb], [token_emb])[0][0]
            if similarity >= similarity_threshold:
                TPP += 1
                flag = True
            else:
                FPP += 1
    if flag:
        TPR += 1
    else:
        FNR += 1


# Compute precision and recall
precision = TPP / (TPP + FPP) if (TPP + FPP) > 0 else 0
recall = TPR / (TPR + FNR) if (TPR + FNR) > 0 else 0

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

result = {'model': "HooshvareLab/albert-fa-zwnj-base-v2",
          'evaluation_dataset': 'allenai/c4',
          'Recall': recall,
          'Precision': precision,
          'F1': 2 * (recall * precision) / (recall + precision) if (recall + precision) > 0 else 0}

result = pd.DataFrame([result])

result.to_csv('result.csv', index=False)
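Since tokenize_dataset hard-codes token id 4 as the mask id, a quick sanity check (a minimal sketch; it only assumes the same HooshvareLab tokenizer used above) is to print the tokenizer's own mask token and id and confirm they match:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")
# Prints the mask token string and its id; these should be "[MASK]" and 4
# if the hard-coded value in tokenize_dataset is correct.
print(tok.mask_token, tok.mask_token_id)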
normalizer.py CHANGED
@@ -78,7 +78,8 @@ def cleaning(text):
 
     # replace some characters
     text = re.sub("ة", "ه", text)
-
+    text = text.replace(" [ZWNJ] ", "\u200c")
+
     return text
 
 
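A minimal sketch of what the added line does (the " [ZWNJ] " placeholder is assumed to be how the zero-width non-joiner appears in the decoded text that cleaning() receives):

# Hypothetical decoded string containing the placeholder:
s = "می [ZWNJ] شود"
print(s.replace(" [ZWNJ] ", "\u200c"))  # prints the word with a real U+200C joiner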
result.csv ADDED

model,evaluation_dataset,Recall,Precision,F1
HooshvareLab/albert-fa-zwnj-base-v2,allenai/c4,0.72,0.272,0.1974193548387097