BounharAbdelaziz
committed on
Commit
·
1656d75
1
Parent(s):
2cabce8
saving through HfApi
Browse files- constants.py +1 -0
- utils.py +117 -137
constants.py
CHANGED
@@ -2,6 +2,7 @@ from datasets import load_dataset
|
|
2 |
|
3 |
|
4 |
# Constants values
|
|
|
5 |
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
|
6 |
DIALECT_CONFUSION_LEADERBOARD_FILE = "darija_leaderboard_dialect_confusion.json"
|
7 |
MULTI_DIALECTS_LEADERBOARD_FILE = "darija_leaderboard_multi_dialects.json"
|
|
|
2 |
|
3 |
|
4 |
# Constants values
|
5 |
+
LEADERBOARD_PATH = "atlasia/Open-Arabic-Dialect-Identification-Leaderboard"
|
6 |
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
|
7 |
DIALECT_CONFUSION_LEADERBOARD_FILE = "darija_leaderboard_dialect_confusion.json"
|
8 |
MULTI_DIALECTS_LEADERBOARD_FILE = "darija_leaderboard_multi_dialects.json"
|
utils.py
CHANGED
@@ -14,15 +14,8 @@ from sklearn.metrics import (
|
|
14 |
)
|
15 |
import numpy as np
|
16 |
from constants import *
|
|
|
17 |
from pathlib import Path
|
18 |
-
import logging
|
19 |
-
|
20 |
-
|
21 |
-
def get_repo_file_path(filename):
|
22 |
-
"""Get the full path to a file in the repository root"""
|
23 |
-
repo_path = Path("/home/user/app")
|
24 |
-
file_path = repo_path / filename
|
25 |
-
return file_path
|
26 |
|
27 |
def predict_label(text, model, language_mapping_dict, use_mapping=False):
|
28 |
"""
|
@@ -190,64 +183,7 @@ def run_eval_one_vs_all(data_test, TARGET_LANG='Morocco'):
|
|
190 |
|
191 |
return out
|
192 |
|
193 |
-
|
194 |
-
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE="darija_leaderboard_dialect_confusion.json"):
|
195 |
-
file_path = get_repo_file_path(DIALECT_CONFUSION_LEADERBOARD_FILE)
|
196 |
-
|
197 |
-
# Log file information for debugging
|
198 |
-
print(f"Attempting to access file at: {file_path}")
|
199 |
-
print(f"File exists: {file_path.exists()}")
|
200 |
-
print(f"File permissions: {oct(os.stat(file_path).st_mode)[-3:]}" if file_path.exists() else "File does not exist")
|
201 |
-
|
202 |
-
try:
|
203 |
-
# Try to read existing data
|
204 |
-
if file_path.exists():
|
205 |
-
try:
|
206 |
-
with open(file_path, "r") as f:
|
207 |
-
data = json.load(f)
|
208 |
-
except PermissionError:
|
209 |
-
print(f"Permission denied reading file: {file_path}")
|
210 |
-
raise
|
211 |
-
else:
|
212 |
-
data = []
|
213 |
-
# Try to create the file
|
214 |
-
try:
|
215 |
-
file_path.touch()
|
216 |
-
except PermissionError:
|
217 |
-
print(f"Permission denied creating file: {file_path}")
|
218 |
-
raise
|
219 |
-
|
220 |
-
# Process the results for each dialect/country
|
221 |
-
for _, row in result_df.iterrows():
|
222 |
-
dialect = row['dialect']
|
223 |
-
if dialect == 'Other':
|
224 |
-
continue
|
225 |
-
|
226 |
-
target_entry = next((item for item in data if target_lang in item), None)
|
227 |
-
if target_entry is None:
|
228 |
-
target_entry = {target_lang: {}}
|
229 |
-
data.append(target_entry)
|
230 |
-
|
231 |
-
country_data = target_entry[target_lang]
|
232 |
-
|
233 |
-
if dialect not in country_data:
|
234 |
-
country_data[dialect] = {}
|
235 |
-
|
236 |
-
country_data[dialect][model_name] = float(row['false_positive_rate'])
|
237 |
-
|
238 |
-
# Try to write the updated data
|
239 |
-
try:
|
240 |
-
with open(file_path, "w") as f:
|
241 |
-
json.dump(data, f, indent=4)
|
242 |
-
print(f"Successfully wrote to file: {file_path}")
|
243 |
-
except PermissionError:
|
244 |
-
print(f"Permission denied writing to file: {file_path}")
|
245 |
-
raise
|
246 |
-
|
247 |
-
except Exception as e:
|
248 |
-
print(f"Error handling file {file_path}: {str(e)}")
|
249 |
-
raise
|
250 |
-
|
251 |
def handle_evaluation(model_path, model_path_bin, use_mapping=False):
|
252 |
|
253 |
# download model and get the model path
|
@@ -364,76 +300,6 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su
|
|
364 |
|
365 |
return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
|
366 |
|
367 |
-
def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multi_dialects.json"):
|
368 |
-
file_path = get_repo_file_path(MULTI_DIALECTS_LEADERBOARD_FILE)
|
369 |
-
|
370 |
-
# Log file information for debugging
|
371 |
-
print(f"Attempting to access file at: {file_path}")
|
372 |
-
print(f"File exists: {file_path.exists()}")
|
373 |
-
print(f"File permissions: {oct(os.stat(file_path).st_mode)[-3:]}" if file_path.exists() else "File does not exist")
|
374 |
-
|
375 |
-
try:
|
376 |
-
# Try to read existing data
|
377 |
-
if file_path.exists():
|
378 |
-
try:
|
379 |
-
with open(file_path, "r") as f:
|
380 |
-
data = json.load(f)
|
381 |
-
except PermissionError:
|
382 |
-
print(f"Permission denied reading file: {file_path}")
|
383 |
-
raise
|
384 |
-
else:
|
385 |
-
data = []
|
386 |
-
# Try to create the file
|
387 |
-
try:
|
388 |
-
file_path.touch()
|
389 |
-
except PermissionError:
|
390 |
-
print(f"Permission denied creating file: {file_path}")
|
391 |
-
raise
|
392 |
-
|
393 |
-
# Process the results for each dialect/country
|
394 |
-
for _, row in result_df.iterrows():
|
395 |
-
country = row['country']
|
396 |
-
if country == 'Other':
|
397 |
-
continue
|
398 |
-
|
399 |
-
metrics = {
|
400 |
-
'f1_score': float(row['f1_score']),
|
401 |
-
'precision': float(row['precision']),
|
402 |
-
'recall': float(row['recall']),
|
403 |
-
'macro_f1_score': float(row['macro_f1_score']),
|
404 |
-
'micro_f1_score': float(row['micro_f1_score']),
|
405 |
-
'weighted_f1_score': float(row['weighted_f1_score']),
|
406 |
-
'specificity': float(row['specificity']),
|
407 |
-
'false_positive_rate': float(row['false_positive_rate']),
|
408 |
-
'false_negative_rate': float(row['false_negative_rate']),
|
409 |
-
'negative_predictive_value': float(row['negative_predictive_value']),
|
410 |
-
'balanced_accuracy': float(row['balanced_accuracy']),
|
411 |
-
'matthews_correlation': float(row['matthews_correlation']),
|
412 |
-
'n_test_samples': int(row['samples'])
|
413 |
-
}
|
414 |
-
|
415 |
-
country_entry = next((item for item in data if country in item), None)
|
416 |
-
if country_entry is None:
|
417 |
-
country_entry = {country: {}}
|
418 |
-
data.append(country_entry)
|
419 |
-
|
420 |
-
if country not in country_entry:
|
421 |
-
country_entry[country] = {}
|
422 |
-
country_entry[country][model_name] = metrics
|
423 |
-
|
424 |
-
# Try to write the updated data
|
425 |
-
try:
|
426 |
-
with open(file_path, "w") as f:
|
427 |
-
json.dump(data, f, indent=4)
|
428 |
-
print(f"Successfully wrote to file: {file_path}")
|
429 |
-
except PermissionError:
|
430 |
-
print(f"Permission denied writing to file: {file_path}")
|
431 |
-
raise
|
432 |
-
|
433 |
-
except Exception as e:
|
434 |
-
print(f"Error handling file {file_path}: {str(e)}")
|
435 |
-
raise
|
436 |
-
|
437 |
|
438 |
def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
|
439 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
@@ -589,4 +455,118 @@ def create_html_image(image_path):
|
|
589 |
|
590 |
def render_fixed_columns(df):
|
591 |
""" A function to render HTML table with fixed 'model' column for better visibility """
|
592 |
-
return NotImplementedError
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
)
|
15 |
import numpy as np
|
16 |
from constants import *
|
17 |
+
from huggingface_hub import HfApi, login
|
18 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
def predict_label(text, model, language_mapping_dict, use_mapping=False):
|
21 |
"""
|
|
|
183 |
|
184 |
return out
|
185 |
|
186 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
def handle_evaluation(model_path, model_path_bin, use_mapping=False):
|
188 |
|
189 |
# download model and get the model path
|
|
|
300 |
|
301 |
return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
|
302 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
|
304 |
def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
|
305 |
current_dir = os.path.dirname(os.path.abspath(__file__))
|
|
|
455 |
|
456 |
def render_fixed_columns(df):
    """Render an HTML table with a fixed 'model' column for better visibility.

    This function is a placeholder and has no implementation yet.

    Args:
        df: The table to render (currently unused).

    Raises:
        NotImplementedError: Always.
    """
    # Raise the exception instead of returning the exception class, so that a
    # caller fails loudly rather than receiving a class object where it
    # expected rendered output.
    raise NotImplementedError("render_fixed_columns is not implemented yet")
|
459 |
+
|
460 |
+
def update_repo_file(api, repo_id, filename, data, app_dir="/home/user/app"):
    """Write ``data`` as JSON into the app directory and upload it to the Space.

    Args:
        api: Object exposing ``upload_file`` (an ``HfApi`` instance in this file).
        repo_id: Identifier of the repository that receives the file.
        filename: File name, used both for the local copy inside ``app_dir``
            and as the path inside the repository.
        data: JSON-serializable content to store.
        app_dir: Directory holding the local copy. Defaults to the app
            directory this helper has always written to.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized to JSON.
        OSError: If the local file cannot be written.
    """
    # Serialize before opening the file: opening in "w" mode truncates it, so
    # a serialization error raised midway would leave a corrupted local copy.
    payload = json.dumps(data, indent=4)

    local_file = Path(app_dir) / filename
    with open(local_file, "w", encoding="utf-8") as f:
        f.write(payload)

    # Upload the freshly written file back to the repository.
    api.upload_file(
        path_or_fileobj=str(local_file),
        path_in_repo=filename,
        repo_id=repo_id,
        repo_type="space",
        commit_message=f"Update {filename}",
    )
|
478 |
+
|
479 |
+
def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE="darija_leaderboard_dialect_confusion.json"):
    """Record a model's per-dialect false-positive rates in the leaderboard file.

    The leaderboard is a JSON list of single-key dicts shaped like
    ``{target_lang: {dialect: {model_name: false_positive_rate}}}``. The
    current file is downloaded from the repository, the rows of ``result_df``
    are merged into it, and the result is written back through
    ``update_repo_file``.

    Args:
        result_df: DataFrame whose rows provide ``'dialect'`` and
            ``'false_positive_rate'``. Rows whose dialect is ``'Other'`` are
            skipped.
        model_name: Key under which this model's scores are stored.
        target_lang: Target language/country whose entry is updated.
        DIALECT_CONFUSION_LEADERBOARD_FILE: Name of the leaderboard JSON file
            in the repository.

    Raises:
        Exception: Any download, parsing or upload error is printed and
            re-raised. Only a file that is genuinely absent from the
            repository starts a fresh, empty leaderboard.
    """
    # Imported locally so this function does not rely on a top-of-file import.
    from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError

    api = HfApi()

    try:
        # Download the existing file. The repo type must match the one
        # `update_repo_file` uploads to ("space"), otherwise the
        # read-modify-write cycle never sees the previously stored results.
        try:
            local_copy = api.hf_hub_download(
                repo_id=LEADERBOARD_PATH,
                filename=DIALECT_CONFUSION_LEADERBOARD_FILE,
                repo_type="space",
            )
        except LocalEntryNotFoundError:
            # Network/cache problem, not a missing file: do not fall back to
            # an empty leaderboard, which would overwrite stored results.
            raise
        except EntryNotFoundError:
            # The file does not exist in the repository yet.
            data = []
        else:
            with open(local_copy, "r", encoding="utf-8") as f:
                data = json.load(f)

        # Merge the results, one row per dialect.
        for _, row in result_df.iterrows():
            dialect = row['dialect']
            if dialect == 'Other':
                continue

            # Find (or lazily create) the entry for the target language.
            target_entry = next((item for item in data if target_lang in item), None)
            if target_entry is None:
                target_entry = {target_lang: {}}
                data.append(target_entry)

            country_data = target_entry[target_lang]
            country_data.setdefault(dialect, {})[model_name] = float(row['false_positive_rate'])

        # Update the file in the repository.
        update_repo_file(api, LEADERBOARD_PATH, DIALECT_CONFUSION_LEADERBOARD_FILE, data)

    except Exception as e:
        print(f"Error updating repository: {str(e)}")
        raise
|
519 |
+
|
520 |
+
def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multi_dialects.json"):
    """Record a model's per-country metrics in the multi-dialect leaderboard file.

    The leaderboard is a JSON list of single-key dicts shaped like
    ``{country: {model_name: metrics}}``. The current file is downloaded from
    the repository, the rows of ``result_df`` are merged into it, and the
    result is written back through ``update_repo_file``.

    Args:
        result_df: DataFrame whose rows provide ``'country'``, ``'samples'``
            and the metric columns read below. Rows whose country is
            ``'Other'`` are skipped.
        model_name: Key under which this model's metrics are stored.
        MULTI_DIALECTS_LEADERBOARD_FILE: Name of the leaderboard JSON file in
            the repository.

    Raises:
        Exception: Any download, parsing or upload error is printed and
            re-raised. Only a file that is genuinely absent from the
            repository starts a fresh, empty leaderboard.
    """
    # Imported locally so this function does not rely on a top-of-file import.
    from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError

    api = HfApi()

    # Columns copied verbatim (as floats) from each result row.
    float_metrics = (
        'f1_score',
        'precision',
        'recall',
        'macro_f1_score',
        'micro_f1_score',
        'weighted_f1_score',
        'specificity',
        'false_positive_rate',
        'false_negative_rate',
        'negative_predictive_value',
        'balanced_accuracy',
        'matthews_correlation',
    )

    try:
        # Download the existing file. The repo type must match the one
        # `update_repo_file` uploads to ("space"), otherwise the
        # read-modify-write cycle never sees the previously stored results.
        try:
            local_copy = api.hf_hub_download(
                repo_id=LEADERBOARD_PATH,
                filename=MULTI_DIALECTS_LEADERBOARD_FILE,
                repo_type="space",
            )
        except LocalEntryNotFoundError:
            # Network/cache problem, not a missing file: do not fall back to
            # an empty leaderboard, which would overwrite stored results.
            raise
        except EntryNotFoundError:
            # The file does not exist in the repository yet.
            data = []
        else:
            with open(local_copy, "r", encoding="utf-8") as f:
                data = json.load(f)

        # Merge the results, one row per country.
        for _, row in result_df.iterrows():
            country = row['country']
            if country == 'Other':
                continue

            metrics = {name: float(row[name]) for name in float_metrics}
            metrics['n_test_samples'] = int(row['samples'])

            # Find (or create) the entry for this country.
            country_entry = next((item for item in data if country in item), None)
            if country_entry is None:
                country_entry = {country: {}}
                data.append(country_entry)

            country_entry[country][model_name] = metrics

        # Update the file in the repository.
        update_repo_file(api, LEADERBOARD_PATH, MULTI_DIALECTS_LEADERBOARD_FILE, data)

    except Exception as e:
        print(f"Error updating repository: {str(e)}")
        raise
|