# albert-fa-zwnj-base-v2 / evaluate.py
import numpy as np
import fasttext
import fasttext.util
import pandas as pd
import random
import normalizer  # text-cleaning helper; provides the cleaning() function used below
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
random.seed(42)
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")
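# The tokenizer must match the checkpoint used by the fill-mask pipeline below,
# since the masked inputs are decoded back to text before being scored.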
# model = AutoModelForMaskedLM.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")
# Load pre-trained word embeddings (e.g., fasttext)
fasttext.util.download_model('fa', if_exists='ignore')  # Persian vectors
embeddings = fasttext.load_model(r'cc.fa.300.bin')
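# These 300-dimensional Persian fastText vectors are used below to score how
# semantically close each predicted token is to the masked ground-truth token.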
# Example sentences with masked tokens
# masked_sentences = [
# ("The capital of France is [MASK].", "Paris"),
# ("The [MASK] is the largest mammal.", "whale"),
# ("The fastest land animal is the [MASK].", "cheetah")
# ]
# df = pd.read_excel('law_excel.xlsx', sheet_name='Sheet1')
# dataset = Dataset.from_pandas(df)
dataset = load_dataset('community-datasets/farsi_news', split='hamshahri')
dataset = dataset.shuffle(seed=42).select(range(100))
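# Evaluate on a random sample of 100 news summaries from the Hamshahri split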
def tokenize_dataset(examples):
    # Tokenize each summary, then mask one random non-special token per example
    # and keep the original token string as the ground truth.
    result = tokenizer(examples['summary'])
    temp = {'masked_token': [-1] * len(result['input_ids']), 'input_ids': result['input_ids']}
    for i, example in enumerate(result['input_ids']):
        # Pick a random position, excluding the leading [CLS] and trailing [SEP]
        rand = random.randint(1, len(example) - 2)
        temp['masked_token'][i] = tokenizer.decode(example[rand])
        # Replace the chosen token with the tokenizer's mask token id
        temp['input_ids'][i][rand] = tokenizer.mask_token_id
    result['input_ids'] = temp['input_ids']
    result['masked_token'] = temp['masked_token']
    return result
dataset = dataset.map(tokenize_dataset, batched=True)
# Initialize the fill-mask pipeline
fill_mask = pipeline("fill-mask", model="HooshvareLab/albert-fa-zwnj-base-v2")
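# The pipeline loads its own copy of the checkpoint, so AutoModelForMaskedLM is
# only needed if the model is loaded manually (see the commented-out line above).
# Each call returns a list of candidate fills of the form
# [{'score': ..., 'token': ..., 'token_str': ..., 'sequence': ...}, ...]
# (values shown are placeholders).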
# Define k for top-k predictions
k = 5
# Define similarity threshold
similarity_threshold = 0.5
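# A prediction counts as correct when the cosine similarity between its fastText
# vector and the ground-truth token's vector is at least this value.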
# Initialize counters
TPP = 0  # prediction-level true positives: predictions similar to the ground truth
FPP = 0  # prediction-level false positives: predictions below the threshold
FNR = 0  # example-level false negatives: no prediction matched the ground truth
TPR = 0  # example-level true positives: at least one prediction matched
def get_embedding(word):
    # fastText composes vectors for out-of-vocabulary words from subwords, so
    # indexing never raises; check the vocabulary explicitly instead.
    if embeddings.get_word_id(word) == -1:
        return None
    return embeddings[word]
for data in dataset.iter(batch_size=1):
    # Rebuild the masked sentence, dropping the leading/trailing special tokens
    sentence = tokenizer.decode(data['input_ids'][0][1:-1])
    sentence = normalizer.cleaning(sentence)
    ground_truth = data['masked_token'][0]

    # Get top-k predictions for the masked position
    predictions = fill_mask(sentence, top_k=k)
    predicted_tokens = [pred['token_str'] for pred in predictions]

    ground_truth_emb = get_embedding(ground_truth)
    if ground_truth_emb is None:
        continue  # Skip if the ground truth is not in the embedding vocabulary

    # A prediction is correct if it is semantically close to the ground truth
    flag = False
    for token in predicted_tokens:
        token_emb = get_embedding(token)
        if token_emb is not None:
            similarity = cosine_similarity([ground_truth_emb], [token_emb])[0][0]
            if similarity >= similarity_threshold:
                TPP += 1
                flag = True
            else:
                FPP += 1

    # Example-level tally: did any of the top-k predictions match?
    if flag:
        TPR += 1
    else:
        FNR += 1
# Compute precision and recall
precision = TPP / (TPP + FPP) if (TPP + FPP) > 0 else 0
recall = TPR / (TPR + FNR) if (TPR + FNR) > 0 else 0
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
result = {'model': "HooshvareLab/albert-fa-zwnj-base-v2",
          'evaluation_dataset': 'community-datasets/farsi_news',
          'Recall': recall,
          'Precision': precision,
          'F1': 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0}
result = pd.DataFrame([result])
result.to_csv('result.csv', index=False)
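# To reproduce (assuming the local `normalizer` helper and the fasttext,
# transformers, datasets, scikit-learn, and pandas dependencies are installed):
#     python evaluate.py
# The script prints precision/recall and writes the summary row to result.csv.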