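# Training run configuration (YAML). The field groups appear to mirror
# TRL-style ScriptArguments / ModelConfig dataclasses plus Hugging Face
# TrainingArguments; comments below are descriptive annotations only.

# Run-level bookkeeping and data-handling flags: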
time_start: null
DEBUG: false
debug_model: unsloth/Qwen2.5-7B-bnb-4bit
fold: 0
random_seed: true
train_on_all_folds: false
eval_only: false
merge_adapters: false
wandb_id: null
val_split_name: val
pad_token: <pad>
response_template_ids:
- 4
num_proc: 20
hub_repo_tags:
- odesia
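
# Arguments consumed by the training script (dataset and task selection):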
script_args:
  dataset_name: nbroad/odesia-dipromats-seq-cls-v1
  config: t1
  gradient_checkpointing_use_reentrant: true
  ignore_bias_buffers: false
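
# Base model and PEFT/LoRA adapter settings: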
model_config:
  model_name_or_path: mistralai/Ministral-8B-Instruct-2410
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2
  use_peft: true
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - up_proj
  - down_proj
  - gate_proj
  lora_modules_to_save: null
  lora_task_type: SEQ_CLS
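  # rsLoRA scales each adapter by lora_alpha / sqrt(lora_r) = 32 / 4 = 8
  # rather than the standard lora_alpha / lora_r = 2.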
  use_rslora: true
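  # Quantization is disabled here; the bnb_4bit_* values below are inert
  # unless load_in_4bit (or load_in_8bit) is enabled.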
  load_in_8bit: false
  load_in_4bit: false
  bnb_4bit_quant_type: nf4
  use_bnb_nested_quant: true
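
# Hugging Face Trainer hyperparameters: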
training_args:
  resume_from_checkpoint: null
  output_dir: ./
  num_train_epochs: 1
  per_device_train_batch_size: 8
  per_device_eval_batch_size: 8
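  # First 10% of total steps are LR warmup, feeding into the cosine
  # lr_scheduler_type set further down.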
  warmup_ratio: 0.1
  fp16: false
  bf16: true
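  # Evaluate and checkpoint every 10 optimizer steps, keep at most 2
  # checkpoints, and reload the best one (lowest eval loss) at the end.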
  eval_strategy: steps
  save_strategy: steps
  eval_steps: 10
  save_steps: 10
  save_total_limit: 2
  logging_steps: 2
  run_name: null
  weight_decay: 0.01
  report_to: wandb
  learning_rate: 4.0e-05
  metric_for_best_model: loss
  greater_is_better: false
  gradient_checkpointing: true
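  # Effective train batch size: 8 per device x 8 accumulation steps = 64 per GPU.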
  gradient_accumulation_steps: 8
  gradient_checkpointing_kwargs:
    use_reentrant: true
  optim: adamw_torch
  dataloader_num_workers: 4
  seed: 18
  max_grad_norm: 2.0
  load_best_model_at_end: true
  push_to_hub: true
  hub_private_repo: true
  lr_scheduler_type: cosine
  remove_unused_columns: false
  ddp_find_unused_parameters: false
  use_liger_kernel: true