How can this be trained using trainer?
Getting
-> 2770 outputs = model(**inputs)
2771 # Save past state if it exists
2772 # TODO: this needs to be fixed and made cleaner later.
2773 if self.args.past_index >= 0:
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File /usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:680, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
679 def forward(*args, **kwargs):
--> 680 return model_forward(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:668, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
667 def __call__(self, *args, **kwargs):
--> 668 return convert_to_fp32(self.model_forward(*args, **kwargs))
File /usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py:16, in autocast_decorator.<locals>.decorate_autocast(*args, **kwargs)
13 @functools.wraps(func)
14 def decorate_autocast(*args, **kwargs):
15 with autocast_instance:
---> 16 return func(*args, **kwargs)
TypeError: MambaForCausalLM.forward() got an unexpected keyword argument 'attention_mask'
On code like:
# Reproduction: fine-tuning with the stock Trainer; trainer.train() raises the
# TypeError about 'attention_mask' shown in the traceback above.
model_name = "Q-bert/Mamba-130M"  # other checkpoints tried: "euclaise/falcon_1b_stage2", "mosaicml/mpt-7b-storywriter"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# ... (dataset loading / tokenization omitted by the poster; it defines
# tokenized_train_data and tokenized_val_data used below)
training_args = TrainingArguments(
output_dir="./results",
max_steps=100000,
warmup_steps=500,
weight_decay=0.01,
logging_dir="./logs",
logging_steps=500,
# NOTE(review): later in this thread fp16 is reported to give NaN/0 loss with
# the custom Mamba trainer - confirm before keeping it enabled.
fp16=True,
learning_rate=1e-5,
evaluation_strategy="steps",  # evaluate on a step schedule
eval_steps=10000,  # run validation every 10000 steps
save_steps=10000,
neftune_noise_alpha=5,
)
trainer = Trainer(
model=model,
tokenizer=tokenizer,
train_dataset=tokenized_train_data["train"],
eval_dataset=tokenized_val_data["validation"],
args=training_args
)
trainer.train()
Also fails on SFTTrainer.
Yes, I'm working on this. The stock Trainer does not support this model. I will fix that, but in the meantime I am creating a new trainer class so that this error won't happen.
But there is an error in my class that I am trying to solve right now; I am having some problems with the loss.
from transformers import Trainer ,TrainingArguments
import torch
import os
class MambaTrainer(Trainer):
    """Trainer that computes the next-token loss for a Mamba model itself.

    The model is called with ``input_ids`` only, so batch keys its
    ``forward()`` does not accept (e.g. ``attention_mask``) never reach it.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the causal language-modeling loss for one batch.

        Args:
            model: The model being trained; called as ``model(input_ids)`` and
                expected to return the logits as element 0 of its output.
            inputs: Batch mapping; only ``"input_ids"`` is read. The mapping
                is not modified.
            return_outputs: When true, return ``(loss, outputs)`` where
                ``outputs`` is whatever ``model(input_ids)`` returned.
            num_items_in_batch: Accepted so the override stays callable by
                ``transformers`` versions that pass this keyword; it is
                ignored and the loss is always the mean over all predicted
                positions.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        input_ids = inputs["input_ids"]
        outputs = model(input_ids)
        lm_logits = outputs[0]

        # The targets are the inputs themselves: position t predicts token
        # t + 1, so drop the last logit and the first label.
        labels = input_ids.to(lm_logits.device)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = labels[:, 1:].contiguous()

        # Every position contributes to the loss; no padding mask is applied.
        loss_fct = torch.nn.CrossEntropyLoss()
        lm_loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
        )
        return (lm_loss, outputs) if return_outputs else lm_loss
You can use this trainer, but fp16 must be False; otherwise the loss will be NaN or 0.
I will solve the fp16 problem as soon as possible. I'm investigating why it happens; my guess is that the norm layer is the problem, as with the old T5 models.
Thank you! That is great news!=)
Tried it out... And oh boy it is Slooooooow to finetune on 4090... 200h...
vs 2h for similar sized gpt2, and even 8h for falcon-1b...
Tried it out... And oh boy it is Slooooooow to finetune on 4090... 200h...
vs 2h for similar sized gpt2, and even 8h for falcon-1b...
https://huggingface.co/Q-bert/Mamba-370M/discussions/1
I answered your question here.
I've found that fine-tuning the mamba model using the approach you recommended works really well, but the only downside is that the training speed is too slow. It would be brilliant if you could speed it up in an update. Thannnnnnks ~♥!