See axolotl config

axolotl version: `0.6.0`

```yaml
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
# model_type: LlamaForCausalLM
# processing_class: AutoTokenizer
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true
load_in_8bit: false
load_in_4bit: true
strict: false
chat_template: tokenizer_default
datasets:
- path: PaxwellPaxwell/text_formatted_dataset_law_predict
  type: completion # format from earlier
  field: text
- path: PaxwellPaxwell/law_documents_civil_text_ready_train
  type: completion # format from earlier
  field: text
- path: PaxwellPaxwell/law_documents_criminal_text_ready_train
  type: completion # format from earlier
  field: text
# - path: PaxwellPaxwell/law_documents_criminal_text_2_ready_train
#   type: completion # format from earlier
#   field: text
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v2_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v2_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v2_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v2_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.1_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.1_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.1_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.1_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.2_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.2_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.3_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.3_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.1_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.1_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.1_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.1_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.2_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.2_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.3_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.3_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
dataset_prepared_path: ./workspace/aiLawData/last_run_prepared
val_set_size: 0.02
output_dir: ./workspace/aiLawData/outputs/qwen-32b-memo-law-Instruct-lora-r256-v1
hub_model_id: PaxwellPaxwell/qwen-32b-Memo-law-Instruct-adapter-lora-r256-v1
sequence_len: 12000
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 256
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project: Ai-Law
wandb_entity:
wandb_watch:
wandb_name: qwen-32b-Memo-law-Instruct-adapter-lora-r256-v1
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 10
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
auto_resume_from_checkpoints: true
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 2
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
```
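With axolotl 0.6.0 installed, a config like the one above is typically launched with `accelerate launch -m axolotl.cli.train config.yaml`; that is the standard axolotl invocation, not a record of the exact command used for this run. For orientation, the adapter block maps onto a peft `LoraConfig` roughly as follows (a hypothetical sketch, not code from the training run):

```python
# Hypothetical peft equivalent of the LoRA settings in the config above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=256,              # lora_r: adapter rank
    lora_alpha=16,      # scaling = alpha / r = 16 / 256 = 0.0625
    lora_dropout=0.05,  # lora_dropout
    target_modules=[    # lora_target_modules
        "gate_proj", "down_proj", "up_proj",
        "q_proj", "k_proj", "v_proj", "o_proj",
    ],
    bias="none",            # assumption: the config leaves bias unset
    task_type="CAUSAL_LM",
)
```

Note the low alpha-to-rank ratio: with a scaling factor of 16/256 = 0.0625, each adapter update is damped considerably despite the large rank.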
# qwen-32b-Memo-law-Instruct-adapter-lora-r256-v1
This model is a LoRA adapter fine-tuned from deepseek-ai/DeepSeek-R1-Distill-Qwen-32B on the PaxwellPaxwell law datasets listed in the axolotl config above: the text_formatted_dataset_law_predict, law_documents_civil_text_ready_train and law_documents_criminal_text_ready_train completion corpora, plus the civil and criminal qa_ready sets (v2_train, v3_train, v3.1_train, and the v3 through v3.3 reason variants), each of which is trained in both alpaca and chat_template formats. It achieves the following results on the evaluation set:
- Loss: 0.0747
## Model description
More information needed
## Intended uses & limitations
More information needed
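Since this repository ships a LoRA adapter rather than merged weights, inference requires loading the base model and attaching the adapter. A minimal, untested sketch with transformers and peft follows; the 4-bit quantization mirrors the `load_in_4bit` training setting and is an assumption made here to fit the 32B base model in memory, and the prompt is a placeholder:

```python
# Minimal inference sketch (untested): load the base model in 4-bit, attach the adapter.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
adapter_id = "PaxwellPaxwell/qwen-32b-Memo-law-Instruct-adapter-lora-r256-v1"

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_id)

messages = [{"role": "user", "content": "..."}]  # placeholder legal question
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=512)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```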
## Training and evaluation data
More information needed
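What can be said from the config: the run mixes raw-text completion corpora with QA sets in alpaca and chat_template formats, with 2% held out for evaluation (`val_set_size: 0.02`). The record shapes below are a hypothetical illustration; the field names come from the config, and the contents are placeholders:

```python
# Hypothetical record shapes for the three dataset formats in the config.
completion_record = {
    "text": "...",  # type: completion with field: text; raw text for next-token training
}

alpaca_record = {  # type: alpaca; standard instruction-tuning fields
    "instruction": "...",
    "input": "",
    "output": "...",
}

chat_record = {  # type: chat_template; per field_messages / message_field_* settings
    "messages": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."},
    ],
}
```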
## Training procedure
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 0.0002
- train_batch_size: 1
- eval_batch_size: 1
- seed: 42
- gradient_accumulation_steps: 4
- total_train_batch_size: 4
- optimizer: paged_adamw_8bit (betas=(0.9, 0.999), epsilon=1e-08, no additional optimizer arguments)
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 100
- num_epochs: 10
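The total train batch size reported above follows directly from the config; a quick check (assuming a single GPU, since no distributed setup is listed):

```python
# Effective batch size per optimizer step, derived from the config
# (assumes single-GPU training; multiply by world size if data-parallel).
micro_batch_size = 1
gradient_accumulation_steps = 4
total_train_batch_size = micro_batch_size * gradient_accumulation_steps
assert total_train_batch_size == 4  # matches the reported hyperparameter
```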
### Training results
Training Loss | Epoch | Step | Validation Loss |
---|---|---|---|
1.1799 | 0.0020 | 1 | 1.1722 |
0.6044 | 0.4995 | 253 | 0.6105 |
0.5819 | 0.9990 | 506 | 0.5499 |
0.4995 | 1.4975 | 759 | 0.5053 |
0.4317 | 1.9970 | 1012 | 0.4618 |
0.4027 | 2.4956 | 1265 | 0.4295 |
0.3853 | 2.9951 | 1518 | 0.3894 |
0.3118 | 3.4936 | 1771 | 0.3602 |
0.2108 | 3.9931 | 2024 | 0.3234 |
0.2733 | 2.1213 | 2277 | 0.3511 |
0.2498 | 2.3574 | 2530 | 0.3214 |
0.202 | 2.5934 | 2783 | 0.3020 |
0.1511 | 2.8295 | 3036 | 0.2784 |
0.0767 | 3.0662 | 3289 | 0.2523 |
0.1337 | 3.3023 | 3542 | 0.2431 |
0.0918 | 3.5384 | 3795 | 0.2190 |
0.0675 | 3.7744 | 4048 | 0.1992 |
0.0367 | 4.0112 | 4301 | 0.1823 |
0.0613 | 4.2473 | 4554 | 0.1644 |
0.0643 | 4.4833 | 4807 | 0.1569 |
0.0356 | 4.7194 | 5060 | 0.1406 |
0.0513 | 4.9554 | 5313 | 0.1237 |
0.0152 | 5.1922 | 5566 | 0.1159 |
0.0347 | 5.4329 | 5819 | 0.1069 |
0.0232 | 5.6690 | 6072 | 0.1012 |
0.0172 | 5.9051 | 6325 | 0.1077 |
0.0134 | 6.1390 | 6578 | 0.0939 |
0.0087 | 6.3751 | 6831 | 0.0873 |
0.0059 | 6.6111 | 7084 | 0.0827 |
0.0063 | 6.8500 | 7337 | 0.0785 |
0.0056 | 7.0840 | 7590 | 0.0778 |
0.0032 | 7.3200 | 7843 | 0.0760 |
0.0029 | 7.5561 | 8096 | 0.0753 |
0.0066 | 7.7922 | 8349 | 0.0738 |
0.0038 | 8.0289 | 8602 | 0.0738 |
0.004 | 8.2650 | 8855 | 0.0747 |
0.0025 | 8.5010 | 9108 | 0.0742 |
0.0057 | 8.7371 | 9361 | 0.0738 |
0.0024 | 8.9732 | 9614 | 0.0733 |
0.0015 | 9.2109 | 9867 | 0.0745 |
0.0017 | 9.4469 | 10120 | 0.0746 |
0.0027 | 9.6830 | 10373 | 0.0746 |
0.002 | 9.9191 | 10626 | 0.0747 |
### Framework versions
- PEFT 0.14.0
- Transformers 4.47.1
- PyTorch 2.5.1+cu124
- Datasets 3.2.0
- Tokenizers 0.21.0