nicholasKluge committed on
Commit
2ef38c1
·
1 Parent(s): c88d27f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +114 -4
README.md CHANGED
@@ -12,6 +12,14 @@ tags:
12
  - hate-speech
13
  ---
14
 
 
 
 
 
 
 
 
 
15
  ## nicholasKluge/Teeny-tiny-llama-162m-hatebr
16
 
17
  | Epoch | Training Loss | Validation Loss | Accuracy |
@@ -20,12 +28,114 @@ tags:
20
  | 2 | 0.129100 | 0.371028 | 0.905714 |
21
  | 3 | 0.019300 | 0.428130 | 0.907143 |
22
 
23
- ## bert-base-portuguese-cased-hatebr
24
 
25
  | Epoch | Training Loss | Validation Loss | Accuracy |
26
  |-------|---------------|------------------|----------|
27
- | 1 | 0.469500 | 0.529507 | 0.862143 |
28
- | 2 | 0.293200 | 0.383391 | 0.917857 |
29
- | 3 | 0.084900 | 0.429867 | 0.912857 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
 
31
 
 
 
12
  - hate-speech
13
  ---
14
 
15
+ ## bert-base-portuguese-cased-hatebr
16
+
17
+ | Epoch | Training Loss | Validation Loss | Accuracy |
18
+ |-------|---------------|------------------|----------|
19
+ | 1 | 0.469500 | 0.529507 | 0.862143 |
20
+ | 2 | 0.293200 | 0.383391 | 0.917857 |
21
+ | 3 | 0.084900 | 0.429867 | 0.912857 |
22
+
23
  ## nicholasKluge/Teeny-tiny-llama-162m-hatebr
24
 
25
  | Epoch | Training Loss | Validation Loss | Accuracy |
 
28
  | 2 | 0.129100 | 0.371028 | 0.905714 |
29
  | 3 | 0.019300 | 0.428130 | 0.907143 |
30
 
31
+ ## gpt2-small-portuguese-hatebr
32
 
33
  | Epoch | Training Loss | Validation Loss | Accuracy |
34
  |-------|---------------|------------------|----------|
35
+ | 1 | 0.475400 | 0.333722 | 0.864286 |
36
+ | 2 | 0.338800 | 0.550519 | 0.852143 |
37
+ | 3 | 0.207900 | 0.596878 | 0.874286 |
38
+
39
+
40
+ ```python
41
+
42
+ # Hatebr
43
+ ! pip install transformers datasets evaluate accelerate -q
44
+
45
+ import evaluate
46
+ import numpy as np
47
+ from huggingface_hub import login
48
+ from datasets import load_dataset, Dataset, DatasetDict
49
+ from transformers import AutoTokenizer, DataCollatorWithPadding
50
+ from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
51
+
52
+ token="your_token"
53
+ task="ruanchaves/hatebr"
54
+ model_name="neuralmind/bert-large-portuguese-cased"
55
+ output_dir="checkpoint"
56
+ learning_rate=5e-5
57
+ per_device_train_batch_size=4
58
+ per_device_eval_batch_size=4
59
+ num_train_epochs=3
60
+ weight_decay=0.01
61
+ evaluation_strategy="epoch"
62
+ save_strategy="epoch"
63
+ hub_model_id="nicholasKluge/gpt2-small-portuguese-hatebr"
64
+
65
+
66
+ login(token=token)
67
+
68
+ dataset = load_dataset(task)
69
+
70
+ train = dataset['train'].to_pandas()
71
+ train = train[['instagram_comments', 'offensive_language']]
72
+ train.columns = ['text', 'labels']
73
+ train.labels = train.labels.astype(int)
74
+ train = Dataset.from_pandas(train)
75
+
76
+ test = dataset['test'].to_pandas()
77
+ test = test[['instagram_comments', 'offensive_language']]
78
+ test.columns = ['text', 'labels']
79
+ test.labels = test.labels.astype(int)
80
+ test = Dataset.from_pandas(test)
81
+
82
+ dataset = DatasetDict({
83
+ "train": train,
84
+ "test": test
85
+ })
86
+
87
+ model = AutoModelForSequenceClassification.from_pretrained(
88
+ model_name,
89
+ num_labels=2,
90
+ id2label={0: "NEGATIVE", 1: "POSITIVE"},
91
+ label2id={"NEGATIVE": 0, "POSITIVE": 1}
92
+ )
93
+
94
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
95
+ #tokenizer.pad_token = tokenizer.eos_token
96
+ #model.config.pad_token_id = model.config.eos_token_id
97
+
98
+ def preprocess_function(examples):
99
+ return tokenizer(examples["text"], truncation=True)
100
+
101
+ dataset_tokenized = dataset.map(preprocess_function, batched=True)
102
+
103
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
104
+
105
+ accuracy = evaluate.load("accuracy")
106
+
107
+ def compute_metrics(eval_pred):
108
+ predictions, labels = eval_pred
109
+ predictions = np.argmax(predictions, axis=1)
110
+ return accuracy.compute(predictions=predictions, references=labels)
111
+
112
+ training_args = TrainingArguments(
113
+ output_dir=output_dir,
114
+ learning_rate=learning_rate,
115
+ per_device_train_batch_size=per_device_train_batch_size,
116
+ per_device_eval_batch_size=per_device_eval_batch_size,
117
+ num_train_epochs=num_train_epochs,
118
+ weight_decay=weight_decay,
119
+ evaluation_strategy=evaluation_strategy,
120
+ save_strategy=save_strategy,
121
+ load_best_model_at_end=True,
122
+ push_to_hub=False,
123
+ hub_token=token,
124
+ hub_private_repo=True,
125
+ hub_model_id=hub_model_id,
126
+ tf32=False,
127
+ )
128
+
129
+ trainer = Trainer(
130
+ model=model,
131
+ args=training_args,
132
+ train_dataset=dataset_tokenized["train"],
133
+ eval_dataset=dataset_tokenized["test"],
134
+ tokenizer=tokenizer,
135
+ data_collator=data_collator,
136
+ compute_metrics=compute_metrics,
137
+ )
138
 
139
+ trainer.train()
140
 
141
+ ```