Upload folder using huggingface_hub

426aeec verified 17 days ago

940 Bytes

	badam_mode: layer
	badam_switch_interval: 100
	badam_switch_mode: ascending
	badam_update_ratio: 0.1
	bf16: true
	cutoff_len: 4096
	dataset: smoltalk_chinese
	dataset_dir: data
	ddp_timeout: 180000000
	deepspeed: cache/ds_z3_config.json
	do_train: true
	eval_steps: 5000
	eval_strategy: steps
	finetuning_type: full
	flash_attn: fa2
	gradient_accumulation_steps: 4
	learning_rate: 0.0003
	logging_steps: 5
	lr_scheduler_type: cosine
	max_grad_norm: 2.0
	max_samples: 10000000
	model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
	neat_packing: true
	neftune_noise_alpha: 5
	num_train_epochs: 2.0
	optim: adamw_torch
	output_dir: saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56
	packing: true
	per_device_eval_batch_size: 4
	per_device_train_batch_size: 4
	plot_loss: true
	preprocessing_num_workers: 16
	report_to: none
	save_steps: 5000
	stage: sft
	template: deepseekr1
	trust_remote_code: true
	use_badam: true
	val_size: 0.001
	warmup_steps: 100