train model

scripts/train_model.py  CHANGED  (+16 -10)
@@ -16,16 +16,16 @@ from transformers import (
 
 
 def _batch_iterator():
-
-    dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
+    ## code
+    # dataset = load_dataset('bigcode/programming-languages-keywords', split='train')
 
-    for row in dataset:
-        for n in row['keywords']:
-            yield n
+    # for row in dataset:
+    # for n in row['keywords']:
+    # yield n
 
-    del dataset
-    gc.collect()
-    return
+    # del dataset
+    # gc.collect()
+    # return
 
     # code
     dataset = (
@@ -187,7 +187,14 @@ def batch_iterator():
     for text in _batch_iterator():
         for i in range(0, len(text), 2048):
             chunk = text[i:i + 2048]
-
+            tokenized = tokenize_function(chunk)
+            yield tokenized
+
+
+def tokenize_function(text):
+    outputs = tokenizer(text, truncation=True, padding='max_length', max_length=2048)
+    outputs['labels'] = outputs['input_ids'].copy()
+    return outputs
 
 
 tokenizer = AutoTokenizer.from_pretrained('../')
@@ -241,7 +248,6 @@ training_args = TrainingArguments(
     evaluation_strategy='no',
     save_strategy='epoch',
     torch_compile=True,
-    remove_unused_columns=False,
 )
 print(training_args)
 
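
For context: the commit comments out _batch_iterator()'s old keyword-yielding body, makes the chunk loop yield pre-tokenized examples, and adds tokenize_function(), which pads/truncates each 2048-character chunk to 2048 tokens and copies input_ids into labels (the usual causal-LM setup; the model shifts the labels internally). The sketch below shows one way such a tokenizing generator could feed a Trainer via datasets.Dataset.from_generator. Only tokenize_function() and the chunking loop mirror the diff; the model loading, the sample texts, and the TrainingArguments shown here are illustrative assumptions, not the script's actual code.

# Hedged sketch: wiring a tokenizing generator into Trainer.
# Only tokenize_function() and the chunk loop come from the commit;
# everything else is an assumption marked in the comments.
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained('../')     # as in train_model.py
model = AutoModelForCausalLM.from_pretrained('../')  # assumption: the diff does not show how the model is built


def tokenize_function(text):
    # Added by this commit: fixed-length 2048-token examples with labels = input_ids.
    outputs = tokenizer(text, truncation=True, padding='max_length', max_length=2048)
    outputs['labels'] = outputs['input_ids'].copy()
    return outputs


def batch_iterator():
    # Stand-in for the script's generator: _batch_iterator() and its dataset
    # pipeline are not shown in the diff, so two dummy documents are used here.
    for text in ('def add(a, b):\n    return a + b\n', 'SELECT id FROM users;\n'):
        for i in range(0, len(text), 2048):
            # Plain dict so the Arrow conversion in from_generator handles it cleanly.
            yield dict(tokenize_function(text[i:i + 2048]))


# Materialize the generator so Trainer can index it; every example is already
# padded to the same length, so the default data collator can batch it as-is.
train_dataset = Dataset.from_generator(batch_iterator)

training_args = TrainingArguments(
    output_dir='out',            # assumption: the script's real arguments are longer
    evaluation_strategy='no',    # as in the script (newer transformers renames this to eval_strategy)
    save_strategy='epoch',
)

Trainer(model=model, args=training_args, train_dataset=train_dataset).train()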
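
One note on the third hunk: once every example carries only model-ready columns (input_ids, attention_mask, labels), the Trainer's default column pruning (remove_unused_columns=True) has nothing left to strip, which is presumably why the explicit remove_unused_columns=False override could be dropped.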