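# Training run configuration. From the fields below this appears to be a LoRA (PEFT)
# fine-tune of mistralai/Ministral-8B-Instruct-2410 for sequence classification on the
# ODESIA DIPROMATS dataset, grouped into top-level flags, script_args, model_config,
# and training_args.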
time_start: null
DEBUG: false
debug_model: unsloth/Qwen2.5-7B-bnb-4bit
fold: 0
random_seed: true
train_on_all_folds: false
eval_only: false
merge_adapters: false
wandb_id: null
val_split_name: val
pad_token: <pad>
response_template_ids:
- 4
num_proc: 20
hub_repo_tags:
- odesia
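# script_args: dataset/task selection plus a couple of script-level flags
# (assumed to be consumed by the training script itself rather than by TrainingArguments).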
script_args:
  dataset_name: nbroad/odesia-dipromats-seq-cls-v1
  config: t1
  gradient_checkpointing_use_reentrant: true
  ignore_bias_buffers: false
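# model_config: base model, dtype/attention settings, and PEFT/LoRA hyperparameters
# (rank 16, alpha 32, rsLoRA enabled, adapters on the attention and MLP projection layers,
# no quantized loading).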
model_config:
  model_name_or_path: mistralai/Ministral-8B-Instruct-2410
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2
  use_peft: true
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - up_proj
  - down_proj
  - gate_proj
  lora_modules_to_save: null
  lora_task_type: SEQ_CLS
  use_rslora: true
  load_in_8bit: false
  load_in_4bit: false
  bnb_4bit_quant_type: nf4
  use_bnb_nested_quant: true
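# training_args: Hugging Face TrainingArguments-style fields. One epoch, bf16, cosine
# schedule with 10% warmup, per-device batch size 8 with gradient accumulation 8
# (effective batch size 64 per device before any multi-GPU scaling), eval/checkpoint
# every 10 steps, best checkpoint selected by eval loss.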
training_args:
  resume_from_checkpoint: null
  output_dir: ./
  num_train_epochs: 1
  per_device_train_batch_size: 8
  per_device_eval_batch_size: 8
  warmup_ratio: 0.1
  fp16: false
  bf16: true
  eval_strategy: steps
  save_strategy: steps
  eval_steps: 10
  save_steps: 10
  save_total_limit: 2
  logging_steps: 2
  run_name: null
  weight_decay: 0.01
  report_to: wandb
  learning_rate: 4.0e-05
  metric_for_best_model: loss
  greater_is_better: false
  gradient_checkpointing: true
  gradient_accumulation_steps: 8
  gradient_checkpointing_kwargs:
    use_reentrant: true
  optim: adamw_torch
  dataloader_num_workers: 4
  seed: 18
  max_grad_norm: 2.0
  load_best_model_at_end: true
  push_to_hub: true
  hub_private_repo: true
  lr_scheduler_type: cosine
  remove_unused_columns: false
  ddp_find_unused_parameters: false
  use_liger_kernel: true