# Hugging Face file-viewer header (user: BounharAbdelaziz, file size: 23.9 kB)
# Commit 2cabce8: changed path to /home/user/app for accessing json files
import base64
from huggingface_hub import hf_hub_download
import fasttext
import os
import json
import pandas as pd
from sklearn.metrics import (
precision_score,
recall_score,
f1_score,
confusion_matrix,
balanced_accuracy_score,
matthews_corrcoef
)
import numpy as np
from constants import *
from pathlib import Path
import logging
def get_repo_file_path(filename):
    """Return the location of `filename` under the application root.

    The root is the fixed directory ``/home/user/app``. The result is a
    ``pathlib.Path``; it is not checked for existence.
    """
    return Path("/home/user/app") / filename
def predict_label(text, model, language_mapping_dict, use_mapping=False):
    """
    Runs predictions for a fasttext model.

    Args:
        text (str): The input text to classify. Non-string values are
            converted with ``str``; ``None`` and float NaN are treated as
            empty text.
        model: Object exposing ``predict(text, k)`` that returns a pair
            ``(labels, probabilities)`` (e.g. a loaded fasttext model).
        language_mapping_dict (dict): A dictionary mapping fasttext labels to
            human-readable language names.
        use_mapping (bool): Whether to use the language mapping dictionary.

    Returns:
        str: The predicted label, ``'EMPTY'`` for empty/missing text, or
        ``'Error'`` if the model raised while predicting.
    """
    # Missing values would otherwise be stringified to 'None' / 'nan' and
    # classified as if they were real text.
    if text is None or (isinstance(text, float) and text != text):
        return 'EMPTY'

    # Remove any newline characters and strip whitespace
    text = str(text).strip().replace('\n', ' ')
    if text == '':
        return 'EMPTY'

    try:
        # Get top prediction and remove the __label__ prefix
        prediction = model.predict(text, 1)
        label = prediction[0][0].replace('__label__', '')
        if use_mapping:
            # Labels missing from the mapping are grouped under 'Other'
            label = language_mapping_dict.get(label, 'Other')
        return label
    except Exception as e:
        print(f"Error processing text: {text}")
        print(f"Exception: {e}")
        # Always return a string so the predictions stay hashable labels
        # that the metric computation can handle.
        return 'Error'
def compute_classification_metrics(eval_dataset):
    """
    Build a per-class metrics table from true and predicted labels.

    Args:
        eval_dataset: Anything ``pd.DataFrame`` accepts, with a 'dialect'
            column (true labels) and a 'preds' column (predicted labels).

    Returns:
        pd.DataFrame: One row per label seen in either column, sorted by the
        number of true samples (descending), with numeric values rounded to
        4 decimals. Aggregate scores (macro/weighted/micro F1, balanced
        accuracy, Matthews correlation) are repeated on every row.
    """
    frame = pd.DataFrame(eval_dataset)
    y_true = list(frame['dialect'])
    y_pred = list(frame['preds'])

    # Every label that appears as truth or as prediction gets an integer id
    labels = sorted(set(y_true + y_pred))
    index_of = {label: position for position, label in enumerate(labels)}
    true_indices = [index_of[label] for label in y_true]
    pred_indices = [index_of[label] for label in y_pred]
    class_ids = range(len(labels))

    # Per-class scores
    f1_scores = f1_score(true_indices, pred_indices, average=None, labels=class_ids)
    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=class_ids)
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=class_ids)

    # Aggregate F1 variants
    macro_f1_score = f1_score(true_indices, pred_indices, average='macro')
    weighted_f1_score = f1_score(true_indices, pred_indices, average='weighted')
    micro_f1_score = f1_score(true_indices, pred_indices, average='micro')

    # Confusion-matrix derived counts (rows = truth, columns = prediction)
    conf_mat = confusion_matrix(true_indices, pred_indices, labels=class_ids)
    TP = np.diag(conf_mat)
    FP = conf_mat.sum(axis=0) - TP
    FN = conf_mat.sum(axis=1) - TP
    TN = conf_mat.sum() - (FP + FN + TP)

    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Ratios may divide by zero for classes with no support; those become 0
    with np.errstate(divide='ignore', invalid='ignore'):
        raw_ratios = (
            FP / (FP + TN),  # false positive rate
            FN / (FN + TP),  # false negative rate
            TN / (TN + FP),  # specificity
            TN / (TN + FN),  # negative predictive value
        )
    fp_rate, fn_rate, specificity, npv = (
        np.nan_to_num(ratio, nan=0.0, posinf=0.0, neginf=0.0) for ratio in raw_ratios
    )

    balanced_acc = balanced_accuracy_score(true_indices, pred_indices)
    mcc = matthews_corrcoef(true_indices, pred_indices)

    result_df = pd.DataFrame({
        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
        'macro_f1_score': macro_f1_score,
        'weighted_f1_score': weighted_f1_score,
        'micro_f1_score': micro_f1_score,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,
        'false_positive_rate': fp_rate,
        'false_negative_rate': fn_rate,
        'true_positives': TP,
        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
        'negative_predictive_value': npv,
        'balanced_accuracy': balanced_acc,
        'matthews_correlation': mcc,
    })

    # Largest classes first, numbers trimmed to 4 decimals
    result_df = result_df.sort_values('samples', ascending=False)
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

    print(f'[INFO] result_df \n: {result_df}')
    return result_df
def make_binary(dialect, target):
    """Collapse a label to binary form: `target` itself, or 'Other'."""
    return target if dialect == target else 'Other'
def run_eval_one_vs_all(data_test, TARGET_LANG='Morocco'):
    """
    Compute, per true dialect, the share of samples predicted as TARGET_LANG.

    Args:
        data_test (pd.DataFrame): Must contain 'dialect' (true label) and
            'preds' (predicted label) columns.
        TARGET_LANG (str): The predicted label to measure confusion towards.

    Returns:
        pd.DataFrame: Columns 'dialect' and 'false_positive_rate', ordered by
        dialect sample count (descending). Dialects with no sample predicted
        as TARGET_LANG have no row. For the row where dialect == TARGET_LANG
        the value is the share of correct predictions, not a false-positive
        rate.
    """
    # Attach each dialect's sample count so it can be used as a sort key
    dialect_counts = data_test.groupby('dialect')['dialect'].count().reset_index(name='size')
    merged = pd.merge(dialect_counts, data_test, on='dialect')

    # count(dialect, prediction) / count(dialect), aligned on the shared index levels
    pair_counts = merged.groupby(['dialect', 'size', 'preds'])['preds'].count()
    dialect_totals = merged.groupby(['dialect', 'size'])['preds'].count()
    rates = pair_counts / dialect_totals
    rates.sort_index(ascending=False, level='size', inplace=True)

    # The series must be renamed before reset_index: its default name
    # ('preds') would clash with the 'preds' index level.
    out = rates.rename('false_positive_rate').reset_index()
    out = out[out['preds'] == TARGET_LANG].drop(columns=['preds', 'size'])
    print(f'[INFO] out for TARGET_LANG={TARGET_LANG} \n: {out}')
    return out
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE="darija_leaderboard_dialect_confusion.json"):
    """
    Record one model's per-dialect rates for `target_lang` in the JSON leaderboard.

    The file holds a list of single-key dicts shaped like
    ``{target_lang: {dialect: {model_name: rate}}}``. Existing values for the
    same (target_lang, dialect, model_name) are overwritten; rows whose
    dialect is 'Other' are skipped.

    Args:
        result_df (pd.DataFrame): Rows with 'dialect' and 'false_positive_rate'.
        model_name (str): Key under which the rates are stored.
        target_lang (str): Target language the rates refer to.
        DIALECT_CONFUSION_LEADERBOARD_FILE (str): File name, resolved with
            ``get_repo_file_path``.

    Raises:
        Exception: Any error while reading, parsing or writing the file is
            printed and re-raised.
    """
    file_path = get_repo_file_path(DIALECT_CONFUSION_LEADERBOARD_FILE)

    # Log file information for debugging
    print(f"Attempting to access file at: {file_path}")
    print(f"File exists: {file_path.exists()}")
    print(f"File permissions: {oct(os.stat(file_path).st_mode)[-3:]}" if file_path.exists() else "File does not exist")

    try:
        if file_path.exists():
            try:
                with open(file_path, "r") as f:
                    raw = f.read()
            except PermissionError:
                print(f"Permission denied reading file: {file_path}")
                raise
            # An empty file is what this function leaves behind when it
            # created the file but failed before writing; treat it as an
            # empty leaderboard instead of failing to parse it.
            data = json.loads(raw) if raw.strip() else []
        else:
            data = []
            # Create the file up front so permission problems surface early
            try:
                file_path.touch()
            except PermissionError:
                print(f"Permission denied creating file: {file_path}")
                raise

        # The entry for this target language does not depend on the row
        target_entry = next((item for item in data if target_lang in item), None)

        for _, row in result_df.iterrows():
            dialect = row['dialect']
            if dialect == 'Other':
                continue
            # Only create the entry once there is something to store in it
            if target_entry is None:
                target_entry = {target_lang: {}}
                data.append(target_entry)
            per_dialect = target_entry[target_lang].setdefault(dialect, {})
            per_dialect[model_name] = float(row['false_positive_rate'])

        try:
            with open(file_path, "w") as f:
                json.dump(data, f, indent=4)
            print(f"Successfully wrote to file: {file_path}")
        except PermissionError:
            print(f"Permission denied writing to file: {file_path}")
            raise
    except Exception as e:
        print(f"Error handling file {file_path}: {str(e)}")
        raise
def handle_evaluation(model_path, model_path_bin, use_mapping=False):
    """
    Evaluate a fasttext model from the Hugging Face Hub and refresh the leaderboards.

    Downloads `model_path_bin` from the repo `model_path`, predicts a label
    for every 'text' entry of the module-level `eval_dataset`, then updates
    both JSON leaderboards under the name ``model_path + '/' + model_path_bin``.
    `eval_dataset`, `language_mapping_dict`, `all_target_languages`,
    `target_label`, `default_metrics` and the leaderboard file names are not
    defined in this function (presumably they come from `constants`).

    Args:
        model_path (str): Hub repository id.
        model_path_bin (str): File name of the model inside the repository.
        use_mapping (bool): Forwarded to `predict_label`.

    Returns:
        tuple: (multilingual leaderboard display DataFrame, status message).
    """
    # Fetch the model file and load it
    local_model_file = hf_hub_download(repo_id=model_path, filename=model_path_bin, cache_dir=None)
    print(f"[INFO] Loading model from Path: {local_model_file}, using version {model_path_bin}...")
    classifier = fasttext.load_model(local_model_file)

    print(f"[INFO] Converting evaluation dataset to Pandas DataFrame...")
    predictions_df = pd.DataFrame(eval_dataset)

    print(f"[INFO] Running predictions...")

    def _classify(text):
        return predict_label(text, classifier, language_mapping_dict, use_mapping=use_mapping)

    predictions_df['preds'] = predictions_df['text'].apply(_classify)

    # Multi-class metrics, stored under "<repo>/<file>"
    metrics_df = run_eval(predictions_df)
    model_name = '/'.join((model_path, model_path_bin))
    update_darija_multilingual_leaderboard(metrics_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE)

    # One-vs-all confusion for every configured target language
    for target_lang in all_target_languages:
        confusion_df = run_eval_one_vs_all(predictions_df, TARGET_LANG=target_lang)
        update_darija_one_vs_all_leaderboard(confusion_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE)

    # Reload the updated tables. The one-vs-all table is not part of the
    # return value, but loading it still raises here if the file is unreadable.
    multilingual_table = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
    load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)

    status_message = "**Evaluation now ended! 🤗**"
    display = create_leaderboard_display_multilingual(multilingual_table, target_label, default_metrics)
    return display, status_message
def run_eval(df_eval):
    """Compute multi-class metrics for a DataFrame of predictions.

    Args:
        df_eval (pd.DataFrame): Must contain the 'text', 'metadata' and
            'dataset_source' columns (they are dropped before scoring) plus
            the 'dialect' and 'preds' columns that
            `compute_classification_metrics` reads. The input is not modified.

    Returns:
        pd.DataFrame: The table returned by `compute_classification_metrics`.
    """
    # Work on a copy without the columns the metrics do not need
    labels_only = df_eval.copy().drop(columns=['text', 'metadata', 'dataset_source'])

    print(f"[INFO] Computing metrics...")
    return compute_classification_metrics(labels_only)
def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/", default_language='Morocco'):
    """
    Store an uploaded CSV of predictions and add its scores to the leaderboards.

    Args:
        file: Uploaded file object exposing a ``name`` attribute with the
            path of a CSV file, or ``None`` when nothing was uploaded. The CSV
            needs 'dialect' and 'preds' columns for the metric computation.
        uploaded_model_name (str): Name under which the submission is saved
            and listed; surrounding whitespace is stripped and spaces become
            underscores.
        base_path_save (str): Directory under which a sub-directory named
            after the model is created to hold ``submission.csv``.
        default_language (str): Not used by this function.

    Returns:
        On success, a tuple (multilingual leaderboard display DataFrame,
        status message). If no file was given, or reading/saving failed, a
        single error string is returned instead.
    """
    try:
        if file is None:
            return "Please upload a file."

        # Normalise the model name (this is also the leaderboard key)
        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
        print(f"[INFO] Uploaded model name: {uploaded_model_name}")

        path_saving = os.path.join(base_path_save, uploaded_model_name)

        # The name comes from the uploader: refuse anything that does not
        # resolve to a proper sub-directory of base_path_save (empty name,
        # '..' components, absolute paths).
        base_dir = os.path.realpath(base_path_save)
        target_dir = os.path.realpath(path_saving)
        if target_dir == base_dir or os.path.commonpath([base_dir, target_dir]) != base_dir:
            raise ValueError(f"invalid model name: {uploaded_model_name!r}")

        # Create the directory for saving submissions
        os.makedirs(path_saving, exist_ok=True)
        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # Read the uploaded file as DataFrame
        print(f"[INFO] Loading csv results file...")
        df_eval = pd.read_csv(file.name)

        # Keep a local copy of the submission
        print(f"[INFO] Saving the file locally in: {saved_file_path}")
        df_eval.to_csv(saved_file_path, index=False)
    except Exception as e:
        return f"Error processing file: {str(e)}"

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
    result_df = compute_classification_metrics(df_eval)

    # Update the leaderboards
    update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTI_DIALECTS_LEADERBOARD_FILE)

    # TODO: implement one-vs-all differently for people only submitting a csv
    # file. They need to submit two files, one for multi-lang and the other
    # for one-vs-all.
    for target_lang in all_target_languages:
        result_df_one_vs_all = run_eval_one_vs_all(df_eval, TARGET_LANG=target_lang)
        update_darija_one_vs_all_leaderboard(result_df_one_vs_all, uploaded_model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE)

    # Reload the updated tables. The one-vs-all table is not returned, but
    # loading it still raises here if the file is unreadable.
    df_multilingual = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
    load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)

    status_message = "**Evaluation now ended! 🤗**"
    return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multi_dialects.json"):
    """
    Record one model's per-country metrics in the multi-dialect JSON leaderboard.

    The file holds a list of single-key dicts shaped like
    ``{country: {model_name: metrics}}``. An existing entry for the same
    (country, model_name) is overwritten; rows whose country is 'Other' are
    skipped.

    Args:
        result_df (pd.DataFrame): Metrics table with the columns read below
            ('country', 'samples' and the float metric columns).
        model_name (str): Key under which the metrics are stored.
        MULTI_DIALECTS_LEADERBOARD_FILE (str): File name, resolved with
            ``get_repo_file_path``.

    Raises:
        Exception: Any error while reading, parsing or writing the file is
            printed and re-raised.
    """
    file_path = get_repo_file_path(MULTI_DIALECTS_LEADERBOARD_FILE)

    # Log file information for debugging
    print(f"Attempting to access file at: {file_path}")
    print(f"File exists: {file_path.exists()}")
    print(f"File permissions: {oct(os.stat(file_path).st_mode)[-3:]}" if file_path.exists() else "File does not exist")

    # Columns copied verbatim (as floats) into the stored metrics
    float_metrics = (
        'f1_score',
        'precision',
        'recall',
        'macro_f1_score',
        'micro_f1_score',
        'weighted_f1_score',
        'specificity',
        'false_positive_rate',
        'false_negative_rate',
        'negative_predictive_value',
        'balanced_accuracy',
        'matthews_correlation',
    )

    try:
        if file_path.exists():
            try:
                with open(file_path, "r") as f:
                    raw = f.read()
            except PermissionError:
                print(f"Permission denied reading file: {file_path}")
                raise
            # An empty file is what this function leaves behind when it
            # created the file but failed before writing; treat it as an
            # empty leaderboard instead of failing to parse it.
            data = json.loads(raw) if raw.strip() else []
        else:
            data = []
            # Create the file up front so permission problems surface early
            try:
                file_path.touch()
            except PermissionError:
                print(f"Permission denied creating file: {file_path}")
                raise

        # Process the results for each dialect/country
        for _, row in result_df.iterrows():
            country = row['country']
            if country == 'Other':
                continue

            metrics = {name: float(row[name]) for name in float_metrics}
            metrics['n_test_samples'] = int(row['samples'])

            # Reuse the country's entry if present, otherwise start one
            country_entry = next((item for item in data if country in item), None)
            if country_entry is None:
                country_entry = {country: {}}
                data.append(country_entry)
            country_entry[country][model_name] = metrics

        try:
            with open(file_path, "w") as f:
                json.dump(data, f, indent=4)
            print(f"Successfully wrote to file: {file_path}")
        except PermissionError:
            print(f"Permission denied writing to file: {file_path}")
            raise
    except Exception as e:
        print(f"Error handling file {file_path}: {str(e)}")
        raise
def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
    """
    Load the one-vs-all leaderboard JSON into a wide DataFrame.

    Args:
        DIALECT_CONFUSION_LEADERBOARD_FILE (str): File name, resolved relative
            to the directory containing this module (an absolute path is used
            as is).

    Returns:
        pd.DataFrame: One row per (model, target_language) with one column
        per language holding the stored false positive rate. When the file
        contains no entries, an empty DataFrame with only the 'model' and
        'target_language' columns is returned.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    leaderboard_path = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)
    with open(leaderboard_path, "r") as f:
        data = json.load(f)

    # Flatten {target: {language: {model: rate}}} into one row per value
    rows = [
        {
            'target_language': target_language,
            'language': language,
            'model': model_name,
            'false_positive_rate': false_positive_rate,
        }
        for leaderboard_data in data
        for target_language, results in leaderboard_data.items()
        for language, models in results.items()
        for model_name, false_positive_rate in models.items()
    ]

    # pivot() cannot run on a frame without columns, so an empty leaderboard
    # is returned as an empty table with the key columns only.
    if not rows:
        return pd.DataFrame(columns=['model', 'target_language'])

    df = pd.DataFrame(rows)
    # Languages become columns; each cell is the rate for (model, target_language, language)
    df_pivot = df.pivot(index=['model', 'target_language'], columns='language', values='false_positive_rate').reset_index()
    return df_pivot
def load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE):
    """
    Load the multi-dialect leaderboard JSON into a flat DataFrame.

    Args:
        MULTI_DIALECTS_LEADERBOARD_FILE (str): File name, resolved relative to
            the directory containing this module (an absolute path is used as
            is).

    Returns:
        pd.DataFrame: One row per (country, model) with 'country', 'model'
        and one column per stored metric.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    leaderboard_path = os.path.join(module_dir, MULTI_DIALECTS_LEADERBOARD_FILE)
    with open(leaderboard_path, "r") as handle:
        data = json.load(handle)

    # Walk [{country: {model: metrics}}] and collect the leaf triples
    entries = [
        (country, model_name, metrics)
        for country_data in data
        for country, models in country_data.items()
        for model_name, metrics in models.items()
    ]

    rows = []
    for country, model_name, metrics in entries:
        record = dict(country=country, model=model_name)
        # Metric names become columns next to the two key columns
        record.update(metrics)
        rows.append(record)

    return pd.DataFrame(rows)
def create_leaderboard_display_one_vs_all(df, target_language, selected_languages):
    """
    Prepare the one-vs-all leaderboard table for display.

    Args:
        df (pd.DataFrame): Wide table with 'model', 'target_language' and one
            column per language. It is not modified.
        target_language (str): If truthy, only rows for this target are kept.
        selected_languages (list): Languages to show; the target language
            itself and languages without a column in `df` are not displayed.

    Returns:
        tuple: (display DataFrame with 'model' plus the available selected
        languages, sorted descending by the first available one and with
        float64 columns rounded to 4 decimals; the selected languages without
        the target language).
    """
    # Filter by target_language if specified
    if target_language:
        df = df[df['target_language'] == target_language]

    # The target language is never shown as a column of its own table
    selected_languages = [lang for lang in selected_languages if lang != target_language]

    # Only languages that actually exist as columns can be shown or sorted on
    available_languages = [lang for lang in selected_languages if lang in df.columns]
    columns_to_show = ['model'] + available_languages

    # Sort by the first language that is available
    if available_languages:
        df = df.sort_values(by=available_languages[0], ascending=False)

    # Copy so the rounding below writes to an independent frame, not a slice
    df = df[columns_to_show].copy()

    # Format numeric columns to 4 decimal places
    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)
    return df, selected_languages
def create_leaderboard_display_multilingual(df, selected_country, selected_metrics):
    """
    Prepare the multi-dialect leaderboard table for display.

    Args:
        df (pd.DataFrame): Flat table with 'country', 'model' and metric
            columns. It is not modified.
        selected_country (str): Country to filter on. A falsy value or 'All'
            (any casing) keeps every country and shows the 'country' column.
        selected_metrics (list): Metric columns to show; metrics without a
            column in `df` are ignored.

    Returns:
        pd.DataFrame: 'model' (and 'country' when not filtering) followed by
        the available metrics, sorted descending by the first available
        metric, with float64 columns rounded to 4 decimals.
    """
    if selected_country and selected_country.upper() != 'ALL':
        # Single-country view: the country column would be constant, drop it
        df = df[df['country'] == selected_country].drop(columns=['country'])
        leading_columns = ['model']
    else:
        # All-countries view keeps 'country' for ease of comparison
        leading_columns = ['model', 'country']

    # Only metrics that exist as columns can be shown or sorted on, in both views
    available_metrics = [metric for metric in (selected_metrics or []) if metric in df.columns]
    columns_to_show = leading_columns + available_metrics

    # Sort by the first metric that is available
    if available_metrics:
        df = df.sort_values(by=available_metrics[0], ascending=False)

    # Copy so the rounding below writes to an independent frame, not a slice
    df = df[columns_to_show].copy()

    # Format numeric columns to 4 decimal places
    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)
    return df
def update_leaderboard_multilingual(country, selected_metrics):
    """Reload the multi-dialect leaderboard and build its display table.

    When `selected_metrics` is empty, the module-level `metrics` list (not
    defined in this function) is used instead.
    """
    # If no metrics selected, show all
    chosen_metrics = selected_metrics if selected_metrics else metrics
    leaderboard = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
    return create_leaderboard_display_multilingual(leaderboard, country, chosen_metrics)
def update_leaderboard_one_vs_all(target_language, selected_languages):
    """Reload the one-vs-all leaderboard and build its display table.

    When `selected_languages` is empty, the module-level `default_languages`
    (not defined in this function) is used instead.

    Returns:
        tuple: (display DataFrame, languages as returned by
        `create_leaderboard_display_one_vs_all`).
    """
    # If no language selected, show all defaults
    chosen_languages = selected_languages if selected_languages else default_languages
    leaderboard = load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)
    # With many languages selected the table gets wide; keeping the `model`
    # column fixed (render_fixed_columns) is still to be implemented.
    return create_leaderboard_display_one_vs_all(leaderboard, target_language, chosen_languages)
def encode_image_to_base64(image_path):
    """Read the file at `image_path` and return its content as a base64 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode()
def create_html_image(image_path):
    """Return an HTML snippet showing the image at `image_path`, centered.

    The image is embedded inline as a base64 data URI declared as
    ``image/jpeg``.
    """
    encoded = encode_image_to_base64(image_path)
    # Centering wrapper + embedded image
    return f"""
<div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
<div style="max-width: 800px; margin: auto;">
<img src="data:image/jpeg;base64,{encoded}"
style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
alt="Displayed Image">
</div>
</div>
"""
def render_fixed_columns(df):
    """Render an HTML table with a fixed 'model' column for better visibility.

    Not implemented yet.

    Raises:
        NotImplementedError: Always. It is raised rather than returned, so
            an accidental call fails loudly instead of handing the exception
            class back as if it were a rendered table.
    """
    raise NotImplementedError("render_fixed_columns is not implemented yet")