Initial setup

In [None]:
# Install the evaluation dependencies listed in the project's requirements file.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
%pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt

In [None]:
from datetime import datetime
import os
from huggingface_hub import login, upload_folder
from google.colab import userdata
import shutil

# --- Credentials ---
# The Hugging Face token comes from the Colab secrets store, never from the notebook text.
HF_TOKEN = userdata.get('HF_TOKEN')
# Pass the flag by keyword: newer huggingface_hub releases deprecate giving it positionally.
login(HF_TOKEN, add_to_git_credential=True)

# --- Identifiers and paths ---
BASE_DATASET = 'mmlu'
REPO_ID = 'flunardelli/llm-metaeval'
# Every run shares this folder. A per-run timestamp suffix is kept here, disabled:
# {datetime.now().strftime('%Y-%m-%dT%H-%M-%S')}
BASE_FOLDER = f"/content/{BASE_DATASET}/"
OUTPUT_FOLDER = os.path.join(BASE_FOLDER, 'output')
TASK_FOLDER = os.path.join(BASE_FOLDER, 'tasks')

# Uncomment to discard the artefacts of previous runs before starting:
#shutil.rmtree(BASE_FOLDER)

# exist_ok=True keeps this cell re-runnable: the folders survive between executions
# because BASE_FOLDER is a fixed path.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
os.makedirs(TASK_FOLDER, exist_ok=True)

# Exported so the `!lm_eval` shell cells can read them as $HF_TOKEN, $OUTPUT_FOLDER, $TASK_FOLDER.
os.environ['HF_TOKEN'] = HF_TOKEN
os.environ['OUTPUT_FOLDER'] = OUTPUT_FOLDER
os.environ['TASK_FOLDER'] = TASK_FOLDER

def hf_upload_folder(folder_path):
    """Upload a local folder to the Hugging Face dataset repository.

    The folder is sent to the ``evals/`` path of the dataset repository
    ``REPO_ID``, authenticating with ``HF_TOKEN``.

    Args:
        folder_path: Local directory to upload.
    """
    upload_options = {
        "folder_path": folder_path,
        "path_in_repo": "evals/",
        "repo_id": REPO_ID,
        "token": HF_TOKEN,
        "repo_type": "dataset",
    }
    upload_folder(**upload_options)

def create_task(content, filename):
    """Write a task definition file into the task folder.

    Args:
        content: Text to store in the file (for example a YAML task definition).
        filename: Name of the file to create inside ``TASK_FOLDER``; a file
            that already has this name is overwritten.
    """
    filename_path = os.path.join(TASK_FOLDER, filename)
    # Explicit UTF-8 so the written bytes do not depend on the platform's default encoding.
    with open(filename_path, "w", encoding="utf-8") as f:
        f.write(content)

Create the task definition for the full MMLU dataset (`all` configuration)

In [None]:
# YAML task definition that is written to TASK_FOLDER and loaded by lm_eval via --include_path.
#
# The literal is a raw string (r"""...""") on purpose: the `\n` sequences in
# `doc_to_text` must reach the YAML file as backslash escapes. In a normal Python
# string they would become real line breaks, which YAML folds into single spaces
# inside a double-quoted scalar, so the prompt would lose its line structure.
YAML_mmlu_en_us_string = r"""
task: mmlu_all
dataset_path: cais/mmlu
dataset_name: all
description: "MMLU dataset"
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
num_fewshot: 5
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
"""
create_task(YAML_mmlu_en_us_string, 'mmlu_en_us.yaml')
# Exported so the `!lm_eval` shell cells can select the task as $TASKS;
# the value matches the `task:` name declared in the YAML above.
os.environ['TASKS'] = 'mmlu_all'


Llama Models

In [None]:
# Evaluate meta-llama/Llama-3.2-1B-Instruct (pinned revision) on the tasks named in $TASKS.
# Results and per-sample logs go to $OUTPUT_FOLDER; stdout and stderr are redirected to run.log.
# For a quick smoke test, add `--limit 10` to the command.
!lm_eval \
  --model hf \
  --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct,revision=d0a2081ed47e20ce524e8bc5d132f3fad2f69ff0,trust_remote_code=False,dtype=bfloat16,parallelize=True \
  --tasks $TASKS \
  --include_path $TASK_FOLDER/. \
  --output $OUTPUT_FOLDER \
  --log_samples \
  --batch_size auto &> run.log

In [None]:
# Upload BASE_FOLDER (which contains the task and output folders) to the dataset repository.
hf_upload_folder(BASE_FOLDER)

In [None]:
# Evaluate meta-llama/Llama-3.2-3B-Instruct (pinned revision) on the tasks named in $TASKS.
# Results and per-sample logs go to $OUTPUT_FOLDER; stdout and stderr are redirected to run.log.
# For a quick smoke test, add `--limit 10` to the command.
!lm_eval \
  --model hf \
  --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct,revision=392a143b624368100f77a3eafaa4a2468ba50a72,trust_remote_code=False,dtype=bfloat16,parallelize=True \
  --tasks $TASKS \
  --include_path $TASK_FOLDER/. \
  --output $OUTPUT_FOLDER \
  --log_samples \
  --batch_size auto &> run.log

In [None]:
# Upload BASE_FOLDER (which contains the task and output folders) to the dataset repository.
hf_upload_folder(BASE_FOLDER)

In [None]:
# Evaluate meta-llama/Meta-Llama-3-8B (pinned revision) on the tasks named in $TASKS.
# Results and per-sample logs go to $OUTPUT_FOLDER; stdout and stderr are redirected to run.log.
# For a quick smoke test, add `--limit 10` to the command.
!lm_eval \
  --model hf \
  --model_args pretrained=meta-llama/Meta-Llama-3-8B,revision=62bd457b6fe961a42a631306577e622c83876cb6,trust_remote_code=False,dtype=bfloat16,parallelize=True \
  --tasks $TASKS \
  --include_path $TASK_FOLDER/. \
  --output $OUTPUT_FOLDER \
  --log_samples \
  --batch_size auto &> run.log

In [None]:
# Upload BASE_FOLDER (which contains the task and output folders) to the dataset repository.
hf_upload_folder(BASE_FOLDER)

Mistral Models

In [None]:
# Evaluate mistralai/Mixtral-8x7B-Instruct-v0.1 (pinned revision) on the tasks named in $TASKS.
# Results and per-sample logs go to $OUTPUT_FOLDER; stdout and stderr are redirected to run.log.
# For a quick smoke test, add `--limit 10` to the command.
!lm_eval \
  --model hf \
  --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1,revision=41bd4c9e7e4fb318ca40e721131d4933966c2cc1,trust_remote_code=False,dtype=bfloat16,parallelize=True \
  --tasks $TASKS \
  --include_path $TASK_FOLDER/. \
  --output $OUTPUT_FOLDER \
  --log_samples \
  --batch_size auto &> run.log

In [None]:
# Upload BASE_FOLDER (which contains the task and output folders) to the dataset repository.
hf_upload_folder(BASE_FOLDER)