---
license: apache-2.0
datasets:
- ruanchaves/hatebr
language:
- pt
metrics:
- accuracy
library_name: transformers
pipeline_tag: text-classification
tags:
- hate-speech
---
# TeenyTinyLlama-162m-HateBR
TeenyTinyLlama is a series of small foundation models trained on Portuguese text.
This repository contains a version of [TeenyTinyLlama-162m](https://huggingface.co/nicholasKluge/TeenyTinyLlama-162m) fine-tuned on the [HateBR dataset](https://huggingface.co/datasets/ruanchaves/hatebr), a collection of Brazilian Portuguese Instagram comments annotated for offensive language.
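## Usage
The model can be queried as a binary offensive-language classifier through the `text-classification` pipeline. The snippet below is a minimal sketch: it assumes the fine-tuned checkpoint is published under the repository id `nicholasKluge/TeenyTinyLlama-162m-HateBR` (the repository this card describes) and follows the label mapping used in the fine-tuning script below (`NEGATIVE` = non-offensive, `POSITIVE` = offensive).
```python
from transformers import pipeline

# Repository id assumed from this model card
classifier = pipeline(
    "text-classification",
    model="nicholasKluge/TeenyTinyLlama-162m-HateBR",
)

# Classify a Portuguese comment (labels follow the id2label mapping in the fine-tuning script)
print(classifier("Que postagem maravilhosa, parabéns!"))
```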
## Reproducing
To reproduce the fine-tuning run, install the dependencies and execute the script below.
```python
# HateBR
! pip install transformers datasets evaluate accelerate -q
import evaluate
import numpy as np
from huggingface_hub import login  # optional: call login() as an alternative to passing hub_token below
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Model and training hyperparameters (reused in the `TrainingArguments` below)
model_name = "nicholasKluge/TeenyTinyLlama-162m"
output_dir = "checkpoints"
learning_rate = 4e-5
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
num_train_epochs = 3
weight_decay = 0.01
evaluation_strategy = "epoch"
save_strategy = "epoch"
hub_model_id = "nicholasKluge/Teeny-tiny-llama-162m-hatebr"
# Load the task
dataset = load_dataset("ruanchaves/hatebr")
# Format the dataset
train = dataset['train'].to_pandas()
train = train[['instagram_comments', 'offensive_language']]
train.columns = ['text', 'labels']
train.labels = train.labels.astype(int)
train = Dataset.from_pandas(train)
test = dataset['test'].to_pandas()
test = test[['instagram_comments', 'offensive_language']]
test.columns = ['text', 'labels']
test.labels = test.labels.astype(int)
test = Dataset.from_pandas(test)
dataset = DatasetDict({
    "train": train,
    "test": test,
})
# Create a `ModelForSequenceClassification`
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Preprocess the dataset
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

dataset_tokenized = dataset.map(preprocess_function, batched=True)
# Create a simple data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Use accuracy as evaluation metric
accuracy = evaluate.load("accuracy")
# Function to compute accuracy
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    evaluation_strategy=evaluation_strategy,
    save_strategy=save_strategy,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_token="your_token_here",
    hub_model_id=hub_model_id,
)
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
# Train!
trainer.train()
```
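Once `trainer.train()` finishes, the held-out split can be scored with the same `Trainer` object. The snippet below is a sketch that continues the script above, reusing the `trainer` and `dataset_tokenized` objects it defines.
```python
# Evaluate on the test split; "eval_accuracy" comes from compute_metrics
results = trainer.evaluate(eval_dataset=dataset_tokenized["test"])
print(results)

# Push the final model and tokenizer to the Hub (uses hub_model_id from TrainingArguments)
trainer.push_to_hub()
```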