See axolotl config

axolotl version: `0.6.0`

```yaml
base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
# model_type: LlamaForCausalLM
# processing_class: AutoTokenizer
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true
load_in_8bit: false
load_in_4bit: true
strict: false
chat_template: tokenizer_default
datasets:
- path: PaxwellPaxwell/text_formatted_dataset_law_predict
  type: completion # format from earlier
  field: text
- path: PaxwellPaxwell/law_documents_civil_text_ready_train
  type: completion # format from earlier
  field: text
- path: PaxwellPaxwell/law_documents_criminal_text_ready_train
  type: completion # format from earlier
  field: text
# - path: PaxwellPaxwell/law_documents_criminal_text_2_ready_train
#   type: completion # format from earlier
#   field: text
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v2_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v2_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v2_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v2_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.1_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.1_train
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.1_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.1_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.2_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.2_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.3_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.3_reason
  type: alpaca
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.1_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.1_train
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.1_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.1_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.2_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.2_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_civil_qa_ready_v3.3_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
- path: PaxwellPaxwell/law_documents_criminal_qa_ready_v3.3_reason
  type: chat_template
  field_messages: messages
  message_field_role: role
  message_field_content: content
  roles:
    user:
      - user
    assistant:
      - assistant
dataset_prepared_path: ./workspace/aiLawData/last_run_prepared
val_set_size: 0.02
output_dir: ./workspace/aiLawData/outputs/qwen-32b-memo-law-Instruct-lora-r256-v1
hub_model_id: PaxwellPaxwell/qwen-32b-Memo-law-Instruct-adapter-lora-r256-v1
sequence_len: 12000
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 256
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project: Ai-Law
wandb_entity:
wandb_watch:
wandb_name: qwen-32b-Memo-law-Instruct-adapter-lora-r256-v1
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 10
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
auto_resume_from_checkpoints: true
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 2
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
```
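With axolotl 0.6.0 installed, a config like the one above is typically launched with `accelerate launch -m axolotl.cli.train config.yaml`; that is the standard axolotl invocation, not a record of the exact command used for this run. For orientation, the adapter block maps onto a peft `LoraConfig` roughly as follows (a hypothetical sketch, not code from the training run):

```python
# Hypothetical peft equivalent of the LoRA settings in the config above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=256,              # lora_r: adapter rank
    lora_alpha=16,      # scaling = alpha / r = 16 / 256 = 0.0625
    lora_dropout=0.05,  # lora_dropout
    target_modules=[    # lora_target_modules
        "gate_proj", "down_proj", "up_proj",
        "q_proj", "k_proj", "v_proj", "o_proj",
    ],
    bias="none",            # assumption: the config leaves bias unset
    task_type="CAUSAL_LM",
)
```

Note the low alpha-to-rank ratio: with a scaling factor of 16/256 = 0.0625, each adapter update is damped considerably despite the large rank.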
# qwen-32b-Memo-law-Instruct-adapter-lora-r256-v1
This model is a LoRA adapter fine-tuned from deepseek-ai/DeepSeek-R1-Distill-Qwen-32B on the PaxwellPaxwell law datasets listed in the axolotl config above: the text_formatted_dataset_law_predict, law_documents_civil_text_ready_train and law_documents_criminal_text_ready_train completion corpora, plus the civil and criminal qa_ready sets (v2_train, v3_train, v3.1_train, and the v3 through v3.3 reason variants), each of which is trained in both alpaca and chat_template formats. It achieves the following results on the evaluation set:
- Loss: 0.0747
## Model description
More information needed
## Intended uses & limitations
More information needed
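Since this repository ships a LoRA adapter rather than merged weights, inference requires loading the base model and attaching the adapter. A minimal, untested sketch with transformers and peft follows; the 4-bit quantization mirrors the `load_in_4bit` training setting and is an assumption made here to fit the 32B base model in memory, and the prompt is a placeholder:

```python
# Minimal inference sketch (untested): load the base model in 4-bit, attach the adapter.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
adapter_id = "PaxwellPaxwell/qwen-32b-Memo-law-Instruct-adapter-lora-r256-v1"

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_id)

messages = [{"role": "user", "content": "..."}]  # placeholder legal question
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=512)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```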
## Training and evaluation data
More information needed
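What can be said from the config: the run mixes raw-text completion corpora with QA sets in alpaca and chat_template formats, with 2% held out for evaluation (`val_set_size: 0.02`). The record shapes below are a hypothetical illustration; the field names come from the config, and the contents are placeholders:

```python
# Hypothetical record shapes for the three dataset formats in the config.
completion_record = {
    "text": "...",  # type: completion with field: text; raw text for next-token training
}

alpaca_record = {  # type: alpaca; standard instruction-tuning fields
    "instruction": "...",
    "input": "",
    "output": "...",
}

chat_record = {  # type: chat_template; per field_messages / message_field_* settings
    "messages": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."},
    ],
}
```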
## Training procedure
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 0.0002
- train_batch_size: 1
- eval_batch_size: 1
- seed: 42
- gradient_accumulation_steps: 4
- total_train_batch_size: 4
- optimizer: paged_adamw_8bit (betas=(0.9, 0.999), epsilon=1e-08, no additional optimizer arguments)
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 100
- num_epochs: 10
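The total train batch size reported above follows directly from the config; a quick check (assuming a single GPU, since no distributed setup is listed):

```python
# Effective batch size per optimizer step, derived from the config
# (assumes single-GPU training; multiply by world size if data-parallel).
micro_batch_size = 1
gradient_accumulation_steps = 4
total_train_batch_size = micro_batch_size * gradient_accumulation_steps
assert total_train_batch_size == 4  # matches the reported hyperparameter
```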
### Training results
Training Loss | Epoch | Step | Validation Loss |
---|---|---|---|
1.1799 | 0.0020 | 1 | 1.1722 |
0.6044 | 0.4995 | 253 | 0.6105 |
0.5819 | 0.9990 | 506 | 0.5499 |
0.4995 | 1.4975 | 759 | 0.5053 |
0.4317 | 1.9970 | 1012 | 0.4618 |
0.4027 | 2.4956 | 1265 | 0.4295 |
0.3853 | 2.9951 | 1518 | 0.3894 |
0.3118 | 3.4936 | 1771 | 0.3602 |
0.2108 | 3.9931 | 2024 | 0.3234 |
0.2733 | 2.1213 | 2277 | 0.3511 |
0.2498 | 2.3574 | 2530 | 0.3214 |
0.202 | 2.5934 | 2783 | 0.3020 |
0.1511 | 2.8295 | 3036 | 0.2784 |
0.0767 | 3.0662 | 3289 | 0.2523 |
0.1337 | 3.3023 | 3542 | 0.2431 |
0.0918 | 3.5384 | 3795 | 0.2190 |
0.0675 | 3.7744 | 4048 | 0.1992 |
0.0367 | 4.0112 | 4301 | 0.1823 |
0.0613 | 4.2473 | 4554 | 0.1644 |
0.0643 | 4.4833 | 4807 | 0.1569 |
0.0356 | 4.7194 | 5060 | 0.1406 |
0.0513 | 4.9554 | 5313 | 0.1237 |
0.0152 | 5.1922 | 5566 | 0.1159 |
0.0347 | 5.4329 | 5819 | 0.1069 |
0.0232 | 5.6690 | 6072 | 0.1012 |
0.0172 | 5.9051 | 6325 | 0.1077 |
0.0134 | 6.1390 | 6578 | 0.0939 |
0.0087 | 6.3751 | 6831 | 0.0873 |
0.0059 | 6.6111 | 7084 | 0.0827 |
0.0063 | 6.8500 | 7337 | 0.0785 |
0.0056 | 7.0840 | 7590 | 0.0778 |
0.0032 | 7.3200 | 7843 | 0.0760 |
0.0029 | 7.5561 | 8096 | 0.0753 |
0.0066 | 7.7922 | 8349 | 0.0738 |
0.0038 | 8.0289 | 8602 | 0.0738 |
0.004 | 8.2650 | 8855 | 0.0747 |
0.0025 | 8.5010 | 9108 | 0.0742 |
0.0057 | 8.7371 | 9361 | 0.0738 |
0.0024 | 8.9732 | 9614 | 0.0733 |
0.0015 | 9.2109 | 9867 | 0.0745 |
0.0017 | 9.4469 | 10120 | 0.0746 |
0.0027 | 9.6830 | 10373 | 0.0746 |
0.002 | 9.9191 | 10626 | 0.0747 |
### Framework versions
- PEFT 0.14.0
- Transformers 4.47.1
- PyTorch 2.5.1+cu124
- Datasets 3.2.0
- Tokenizers 0.21.0