time_start: null
DEBUG: false
debug_model: unsloth/Qwen2.5-7B-bnb-4bit
fold: 0
random_seed: true
train_on_all_folds: false
eval_only: false
merge_adapters: false
wandb_id: null
val_split_name: val
pad_token:
response_template_ids:
  - 4
num_proc: 20
hub_repo_tags:
  - odesia
script_args:
  dataset_name: nbroad/odesia-dipromats-seq-cls-v1
  config: t1
  gradient_checkpointing_use_reentrant: true
  ignore_bias_buffers: false
model_config:
  model_name_or_path: mistralai/Ministral-8B-Instruct-2410
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2
  use_peft: true
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  lora_target_modules:
    - q_proj
    - v_proj
    - k_proj
    - o_proj
    - up_proj
    - down_proj
    - gate_proj
  lora_modules_to_save: null
  lora_task_type: SEQ_CLS
  use_rslora: true
  load_in_8bit: false
  load_in_4bit: false
  bnb_4bit_quant_type: nf4
  use_bnb_nested_quant: true
training_args:
  resume_from_checkpoint: null
  output_dir: ./
  num_train_epochs: 1
  per_device_train_batch_size: 8
  per_device_eval_batch_size: 8
  warmup_ratio: 0.1
  fp16: false
  bf16: true
  eval_strategy: steps
  save_strategy: steps
  eval_steps: 10
  save_steps: 10
  save_total_limit: 2
  logging_steps: 2
  run_name: null
  weight_decay: 0.01
  report_to: wandb
  learning_rate: 4.0e-05
  metric_for_best_model: loss
  greater_is_better: false
  gradient_checkpointing: true
  gradient_accumulation_steps: 8
  gradient_checkpointing_kwargs:
    use_reentrant: true
  optim: adamw_torch
  dataloader_num_workers: 4
  seed: 18
  max_grad_norm: 2.0
  load_best_model_at_end: true
  push_to_hub: true
  hub_private_repo: true
  lr_scheduler_type: cosine
  remove_unused_columns: false
  ddp_find_unused_parameters: false
  use_liger_kernel: true
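
The config above groups its keys the way TRL-style training scripts typically do: `model_config` holds the base model and LoRA/quantization settings, while `training_args` maps almost one-to-one onto `transformers.TrainingArguments`. The sketch below is not the author's training script; it only illustrates, under those assumptions, how such a file could be loaded and turned into PEFT/Transformers objects. The file name `config.yaml` and all variable names are hypothetical.

```python
# Minimal sketch: load the YAML config and build the LoRA model + training arguments.
# Assumes the config file is saved as "config.yaml"; not the original training script.
import yaml
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification, TrainingArguments

with open("config.yaml") as f:  # assumed file name
    cfg = yaml.safe_load(f)

mc = cfg["model_config"]

# Base model for sequence classification, loaded in bfloat16 with FlashAttention-2,
# matching torch_dtype / attn_implementation above.
model = AutoModelForSequenceClassification.from_pretrained(
    mc["model_name_or_path"],
    torch_dtype=torch.bfloat16,
    attn_implementation=mc["attn_implementation"],
)

# LoRA adapter configured for SEQ_CLS with rank-stabilized scaling (use_rslora).
lora_cfg = LoraConfig(
    r=mc["lora_r"],
    lora_alpha=mc["lora_alpha"],
    lora_dropout=mc["lora_dropout"],
    target_modules=mc["lora_target_modules"],
    task_type=mc["lora_task_type"],
    use_rslora=mc["use_rslora"],
)
model = get_peft_model(model, lora_cfg)

# The training_args block mirrors TrainingArguments fields, so it can be
# unpacked directly (assuming a recent transformers version that supports
# keys such as eval_strategy and use_liger_kernel).
train_args = TrainingArguments(**cfg["training_args"])
```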