Montazerh82 committed
Commit 3535e25
Parent: d651b2a

add evaluate file

Files changed (4)
  1. .gitignore +2 -2
  2. evaluate.py +124 -0
  3. normalizer.py +2 -1
  4. result.csv +2 -0
.gitignore CHANGED
@@ -1,4 +1,4 @@
- .gitignore
  test.ipynb
  __pycache__/
-
+ cc.en.300.bin
+ cc.fa.300.bin
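
The two new entries keep the fastText embedding binaries out of version control; evaluate.py (added below) downloads the Persian model cc.fa.300.bin at runtime.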
evaluate.py ADDED
@@ -0,0 +1,124 @@
+ import numpy as np
+ import fasttext
+ import fasttext.util
+ import pandas as pd
+ import random
+ import normalizer  # local cleaning module, patched later in this commit
+ from transformers import pipeline
+ from sklearn.metrics.pairwise import cosine_similarity
+ from datasets import Dataset, load_dataset
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ random.seed(42)
+
+ tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")
+ # model = AutoModelForMaskedLM.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")
+
+
+ # Load pre-trained word embeddings (e.g., fastText)
+ fasttext.util.download_model('fa', if_exists='ignore')  # Persian
+ embeddings = fasttext.load_model(r'cc.fa.300.bin')
+
+ # Example sentences with masked tokens
+ # masked_sentences = [
+ #     ("The capital of France is [MASK].", "Paris"),
+ #     ("The [MASK] is the largest mammal.", "whale"),
+ #     ("The fastest land animal is the [MASK].", "cheetah")
+ # ]
+
+ # df = pd.read_excel('law_excel.xlsx', sheet_name='Sheet1')
+ # dataset = Dataset.from_pandas(df)
+ dataset = load_dataset('community-datasets/farsi_news', split='hamshahri')
+ dataset = dataset.shuffle(seed=42).select(range(100))
+
+ def tokenize_dataset(examples):
+     result = tokenizer(examples['summary'])
+
+     temp = {'masked_token': [-1] * len(result['input_ids']), 'input_ids': result['input_ids']}
+     for i, example in enumerate(result['input_ids']):
+         # pick a random non-special position and mask it
+         rand = random.randint(1, len(example) - 2)
+         temp['masked_token'][i] = tokenizer.decode(example[rand])
+         temp['input_ids'][i][rand] = tokenizer.mask_token_id
+
+     result['input_ids'] = temp['input_ids']
+     result['masked_token'] = temp['masked_token']
+
+     return result
+
+ dataset = dataset.map(tokenize_dataset, batched=True)
+
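+ # Each example now carries input_ids with one random position replaced by the
+ # mask token, and masked_token holding the original token string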
+
+ # Initialize the fill-mask pipeline
+ fill_mask = pipeline("fill-mask", model="HooshvareLab/albert-fa-zwnj-base-v2")
+
+ # Define k for top-k predictions
+ k = 5
+ # Define similarity threshold
+ similarity_threshold = 0.5
+
+ # Initialize counters: TPP/FPP count predictions whose embedding is / is not
+ # similar enough to the ground truth (prediction-level precision), while
+ # TPR/FNR count sentences with / without at least one accepted prediction (recall)
+ TPP = 0
+ FPP = 0
+
+ FNR = 0
+ TPR = 0
+
+ def get_embedding(word):
+     try:
+         return embeddings[word]
+     except KeyError:
+         return None
+
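+ # Note: fastText composes vectors from subword n-grams, so lookups rarely
+ # fail; the KeyError guard above is mostly defensive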
+ for data in dataset.iter(batch_size=1):
+     sentence = tokenizer.decode(data['input_ids'][0][1:-1])
+     sentence = normalizer.cleaning(sentence)
+     ground_truth = data['masked_token'][0]
+
+     # Get top-k predictions
+     predictions = fill_mask(sentence, top_k=k)
+     predicted_tokens = [pred['token_str'] for pred in predictions]
+
+     ground_truth_emb = get_embedding(ground_truth)
+
+     if ground_truth_emb is None:
+         continue  # Skip if ground truth is not in the embeddings
+
+     flag = False
+     for token in predicted_tokens:
+         token_emb = get_embedding(token)
+         if token_emb is not None:
+             similarity = cosine_similarity([ground_truth_emb], [token_emb])[0][0]
+             if similarity >= similarity_threshold:
+                 TPP += 1
+                 flag = True
+             else:
+                 FPP += 1
+     if flag:
+         TPR += 1
+     else:
+         FNR += 1
+
+
+ # Compute precision and recall
+ precision = TPP / (TPP + FPP) if (TPP + FPP) > 0 else 0
+ recall = TPR / (TPR + FNR) if (TPR + FNR) > 0 else 0
+
+ print(f"Precision: {precision:.2f}")
+ print(f"Recall: {recall:.2f}")
+
+ result = {'model': "HooshvareLab/albert-fa-zwnj-base-v2",
+           'evaluation_dataset': 'community-datasets/farsi_news',
+           'Recall': recall,
+           'Precision': precision,
+           'F1': 2 * (recall * precision) / (recall + precision) if (recall + precision) > 0 else 0}
+
+ result = pd.DataFrame([result])
+
+ result.to_csv('result.csv', index=False)
normalizer.py CHANGED
@@ -78,7 +78,8 @@ def cleaning(text):
 
      # replace some characters
      text = re.sub("ة", "ه", text)
-
+     text = text.replace(" [ZWNJ] ", "\u200c")
+
      return text
 
 
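The added replace call presumably undoes the tokenizer's rendering of the zero-width non-joiner: the literal " [ZWNJ] " placeholder becomes U+200C again, the invisible character Persian orthography uses inside words, so cleaned sentences match the ZWNJ-aware model's vocabulary.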
result.csv ADDED
@@ -0,0 +1,2 @@
+ model,evaluation_dataset,Recall,Precision,F1
+ HooshvareLab/albert-fa-zwnj-base-v2,community-datasets/farsi_news,0.72,0.272,0.3948387096774194
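
For reference, a minimal sketch of the acceptance test evaluate.py applies to each prediction, shown on a hypothetical word pair (assumes cc.fa.300.bin is already downloaded; the words and the resulting similarity are illustrative, not taken from the run):

import fasttext
from sklearn.metrics.pairwise import cosine_similarity

# Load the same Persian fastText vectors the evaluation uses
emb = fasttext.load_model('cc.fa.300.bin')

gt, pred = 'کتاب', 'کتابی'  # hypothetical ground-truth / predicted tokens
# A prediction counts toward TPP when the cosine similarity between its
# vector and the ground-truth vector reaches the 0.5 threshold
sim = cosine_similarity([emb[gt]], [emb[pred]])[0][0]
print(f"similarity: {sim:.2f}, accepted: {sim >= 0.5}")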