nicholasKluge committed
Commit d2e65bc · 1 Parent(s): 41f51c5

Update README.md

Files changed (1)
  1. README.md +22 -51
README.md CHANGED
@@ -11,31 +11,13 @@ pipeline_tag: text-classification
 tags:
 - hate-speech
 ---
-
-## bert-base-portuguese-cased-hatebr
-
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|-----------------|----------|
-| 1     | 0.469500      | 0.529507        | 0.862143 |
-| 2     | 0.293200      | 0.383391        | 0.917857 |
-| 3     | 0.084900      | 0.429867        | 0.912857 |
-
-## nicholasKluge/Teeny-tiny-llama-162m-hatebr
-
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|-----------------|----------|
-| 1     | 0.348100      | 0.296286        | 0.898571 |
-| 2     | 0.129100      | 0.371028        | 0.905714 |
-| 3     | 0.019300      | 0.428130        | 0.907143 |
-
-## gpt2-small-portuguese-hatebr
-
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|-----------------|----------|
-| 1     | 0.475400      | 0.333722        | 0.864286 |
-| 2     | 0.338800      | 0.550519        | 0.852143 |
-| 3     | 0.207900      | 0.596878        | 0.874286 |
+# TeenyTinyLlama-162m-HateBR
+
+TeenyTinyLlama is a series of small foundational models trained on Portuguese.
+
+This repository contains a version of [TeenyTinyLlama-162m](https://huggingface.co/nicholasKluge/TeenyTinyLlama-162m) fine-tuned on a translated version of the [HateBR dataset](https://huggingface.co/datasets/ruanchaves/hatebr).
+
+## Reproducing
 
 ```python
 
@@ -49,10 +31,8 @@ from datasets import load_dataset, Dataset, DatasetDict
 from transformers import AutoTokenizer, DataCollatorWithPadding
 from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
 
-# Basic fine-tuning arguments
-token="your_token"
-task="ruanchaves/hatebr"
-model_name="nicholasKluge/Teeny-tiny-llama-162m"
+task=""
+model_name=""
 output_dir="checkpoint"
 learning_rate=5e-5
 per_device_train_batch_size=4
@@ -63,13 +43,10 @@ evaluation_strategy="epoch"
 save_strategy="epoch"
 hub_model_id="nicholasKluge/Teeny-tiny-llama-162m-hatebr"
 
-# Login on the hub to load and push
-login(token=token)
-
 # Load the task
-dataset = load_dataset(task)
+dataset = load_dataset("ruanchaves/hatebr")
 
-# Preprocess if needed
+# Format the dataset
 train = dataset['train'].to_pandas()
 train = train[['instagram_comments', 'offensive_language']]
 train.columns = ['text', 'labels']
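
For context: the next hunk picks up inside `dataset = DatasetDict({`, so between these two hunks the README rebuilds the formatted frames into a `DatasetDict`. That step is outside the diff; the following is a hypothetical sketch of it, continuing the script above and assuming the test split is prepared the same way as the train split.

```python
# Hypothetical reconstruction of the elided step; the split and variable
# names are assumptions, not the author's confirmed code.
test = dataset['test'].to_pandas()
test = test[['instagram_comments', 'offensive_language']]
test.columns = ['text', 'labels']

dataset = DatasetDict({
    "train": Dataset.from_pandas(train),
    "test": Dataset.from_pandas(test),
})
```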
@@ -89,19 +66,15 @@ dataset = DatasetDict({
 
 # Create a `ModelForSequenceClassification`
 model = AutoModelForSequenceClassification.from_pretrained(
-    model_name,
+    "nicholasKluge/TeenyTinyLlama-162m",
     num_labels=2,
     id2label={0: "NEGATIVE", 1: "POSITIVE"},
     label2id={"NEGATIVE": 0, "POSITIVE": 1}
 )
 
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# If model does not have a pad_token, we need to add it
-#tokenizer.pad_token = tokenizer._eos_token
-#model.config.pad_token_id = model.config.eos_token_id
+tokenizer = AutoTokenizer.from_pretrained("nicholasKluge/TeenyTinyLlama-162m")
 
-# Pre process the dataset
+# Preprocess the dataset
 def preprocess_function(examples):
     return tokenizer(examples["text"], truncation=True)
 
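
Between this hunk and the next, the script tokenizes the splits and, given the `DataCollatorWithPadding` import above, presumably builds a padding collator. A minimal sketch of that elided step, continuing the script above; dropping the old pad-token workaround suggests the TeenyTinyLlama tokenizer already defines a pad token, though that is an assumption here.

```python
# Sketch of the elided tokenization step; variable names are assumptions.
dataset_tokenized = dataset.map(preprocess_function, batched=True)

# Pads each batch to its longest sequence; requires tokenizer.pad_token to be set.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
```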
@@ -121,20 +94,18 @@ def compute_metrics(eval_pred):
 
 # Define training arguments
 training_args = TrainingArguments(
-    output_dir=output_dir,
-    learning_rate=learning_rate,
-    per_device_train_batch_size=per_device_train_batch_size,
-    per_device_eval_batch_size=per_device_eval_batch_size,
-    num_train_epochs=num_train_epochs,
-    weight_decay=weight_decay,
-    evaluation_strategy=evaluation_strategy,
-    save_strategy=save_strategy,
+    output_dir="checkpoints",
+    learning_rate=4e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
     load_best_model_at_end=True,
-    push_to_hub=False,
-    hub_token=token,
-    hub_private_repo=True,
-    hub_model_id=hub_model_id,
-    tf32=False,
+    push_to_hub=True,
+    hub_token="your_token_here",
+    hub_model_id="username/model-ID",
 )
 
 # Define the Trainer
 
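The last hunk's header references `def compute_metrics(eval_pred):`, whose body lies outside the diff. Since the tables removed above report accuracy, a typical implementation would look like the following sketch (not necessarily the author's exact code).

```python
import numpy as np
import evaluate

# Accuracy-only metric, matching the Accuracy column in the removed tables.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)
```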
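
Once training finishes and the model is pushed, the classifier loads like any Hub model. A minimal inference sketch, assuming the final repo ID matches the `hub_model_id` variable set earlier in the script (`nicholasKluge/Teeny-tiny-llama-162m-hatebr`); per the `id2label` mapping above, POSITIVE corresponds to `offensive_language = 1`.

```python
from transformers import pipeline

# Repo ID assumed from the hub_model_id variable in the diff; adjust if the
# model was pushed under a different name.
classifier = pipeline(
    "text-classification",
    model="nicholasKluge/Teeny-tiny-llama-162m-hatebr",
)

# Returns a label (POSITIVE = offensive, NEGATIVE = not offensive) and a score.
print(classifier("Adorei o vídeo, parabéns pelo trabalho!"))
```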