nicholasKluge
/

TeenyTinyLlama-160m-HateBR

@@ -11,31 +11,13 @@ pipeline_tag: text-classification
 tags:
 - hate-speech
 ---
-## bert-base-portuguese-cased-hatebr
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|------------------|----------|
-|   1   |    0.469500   |     0.529507     | 0.862143 |
-|   2   |    0.293200   |     0.383391     | 0.917857 |
-|   3   |    0.084900   |     0.429867     | 0.912857 |
-## nicholasKluge/Teeny-tiny-llama-162m-hatebr
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|------------------|----------|
-|   1   |    0.348100   |     0.296286     | 0.898571 |
-|   2   |    0.129100   |     0.371028     | 0.905714 |
-|   3   |    0.019300   |     0.428130     | 0.907143 |
-## gpt2-small-portuguese-hatebr
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|------------------|----------|
-|   1   |    0.475400   |     0.333722     | 0.864286 |
-|   2   |    0.338800   |     0.550519     | 0.852143 |
-|   3   |    0.207900   |     0.596878     | 0.874286 |
 ```python
@@ -49,10 +31,8 @@ from datasets import load_dataset, Dataset, DatasetDict
 from transformers import AutoTokenizer, DataCollatorWithPadding
 from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
-# Basic fine-tuning arguments
-token="your_token"
-task="ruanchaves/hatebr"
-model_name="nicholasKluge/Teeny-tiny-llama-162m"
 output_dir="checkpoint"
 learning_rate=5e-5
 per_device_train_batch_size=4
@@ -63,13 +43,10 @@ evaluation_strategy="epoch"
 save_strategy="epoch"
 hub_model_id="nicholasKluge/Teeny-tiny-llama-162m-hatebr"
-# Login on the hub to load and push
-login(token=token)
 # Load the task
-dataset = load_dataset(task)
-# Preprocess if needed
 train = dataset['train'].to_pandas()
 train = train[['instagram_comments', 'offensive_language']]
 train.columns = ['text', 'labels']
@@ -89,19 +66,15 @@ dataset = DatasetDict({
 # Create a `ModelForSequenceClassification`
 model = AutoModelForSequenceClassification.from_pretrained(
-    model_name,
     num_labels=2,
     id2label={0: "NEGATIVE", 1: "POSITIVE"},
     label2id={"NEGATIVE": 0, "POSITIVE": 1}
 )
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-# If model does not have a pad_token, we need to add it
-#tokenizer.pad_token = tokenizer._eos_token
-#model.config.pad_token_id = model.config.eos_token_id
-# Pre process the dataset
 def preprocess_function(examples):
     """Tokenize the ``text`` entry of ``examples`` with truncation enabled."""
     texts = examples["text"]
     return tokenizer(texts, truncation=True)
@@ -121,20 +94,18 @@ def compute_metrics(eval_pred):
 # Define training arguments
 training_args = TrainingArguments(
-    output_dir=output_dir,
-    learning_rate=learning_rate,
-    per_device_train_batch_size=per_device_train_batch_size,
-    per_device_eval_batch_size=per_device_eval_batch_size,
-    num_train_epochs=num_train_epochs,
-    weight_decay=weight_decay,
-    evaluation_strategy=evaluation_strategy,
-    save_strategy=save_strategy,
     load_best_model_at_end=True,
-    push_to_hub=False,
-    hub_token=token,
-    hub_private_repo=True,
-    hub_model_id=hub_model_id,
-    tf32=False,
 )
 # Define the Trainer

 tags:
 - hate-speech
 ---
+# TeenyTinyLlama-162m-HateBR
+TeenyTinyLlama is a series of small foundation models trained on Portuguese text.
+This repository contains a version of [TeenyTinyLlama-162m](https://huggingface.co/nicholasKluge/TeenyTinyLlama-162m) fine-tuned on the [HateBR dataset](https://huggingface.co/datasets/ruanchaves/hatebr), a corpus of Brazilian Portuguese Instagram comments annotated for offensive language.
+## Reproducing
 ```python
 from transformers import AutoTokenizer, DataCollatorWithPadding
 from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
+task=""
+model_name=""
 output_dir="checkpoint"
 learning_rate=5e-5
 per_device_train_batch_size=4
 save_strategy="epoch"
 hub_model_id="nicholasKluge/Teeny-tiny-llama-162m-hatebr"
 # Load the task
+dataset = load_dataset("ruanchaves/hatebr")
+# Format the dataset
 train = dataset['train'].to_pandas()
 train = train[['instagram_comments', 'offensive_language']]
 train.columns = ['text', 'labels']
 # Create a `ModelForSequenceClassification`
 model = AutoModelForSequenceClassification.from_pretrained(
+    "nicholasKluge/TeenyTinyLlama-162m",
     num_labels=2,
     id2label={0: "NEGATIVE", 1: "POSITIVE"},
     label2id={"NEGATIVE": 0, "POSITIVE": 1}
 )
+tokenizer = AutoTokenizer.from_pretrained("nicholasKluge/TeenyTinyLlama-162m")
+# Preprocess the dataset
 def preprocess_function(examples):
     """Tokenize the ``text`` entry of ``examples`` with truncation enabled."""
     texts = examples["text"]
     return tokenizer(texts, truncation=True)
 # Define training arguments
 training_args = TrainingArguments(
+    output_dir="checkpoints",
+    learning_rate=4e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
     load_best_model_at_end=True,
+    push_to_hub=True,
+    hub_token="your_token_here",
+    hub_model_id="username/model-ID",
 )
 # Define the Trainer