# Project-wide configuration settings

# Variables for train-test-split
TRAIN_SIZE: 0.7

# General parameters
max_len: 1000
num_tissues: 8
expressed_threshold: 0.1
random_seed: 766

dnabert:
  max_seq_len: 512
  kmer: 6
  test_size: 0.2

tokenizer:
  vocab_size: 5000

data:
  max_seq_len: 1000
  test_size: 0.2
  num_labels: 8

training:
  pretrain:
    num_train_epochs: 3
    per_device_train_batch_size: 64
    per_device_eval_batch_size: 64
    fp16: true
    logging_steps: 50
    eval_steps: 200
    save_steps: 100
    save_total_limit: 20
    gradient_accumulation_steps: 25
    learning_rate: 1.e-4
    weight_decay: 0
    adam_epsilon: 1.e-8
    max_grad_norm: 10
    warmup_steps: 50
    optimizer: "lamb"
    scheduler: "linear"
    mlm_prob: 0.15

  finetune:
    # num_train_epochs: 10
    num_train_epochs: 3
    per_device_train_batch_size: 64
    per_device_eval_batch_size: 8
    fp16: true
    logging_steps: 50
    eval_steps: 500
    save_steps: 500
    save_total_limit: 10
    # gradient_accumulation_steps: 1
    gradient_accumulation_steps: 10
    eval_accumulation_steps: 64
    learning_rate: 1.e-3
    # learning_rate: 1.e-1
    # lr: 1.e-3
    betas:
      - 0.9
      - 0.999
    eps: 1.e-8
    weight_decay: 0
    adam_epsilon: 1.e-8
    max_grad_norm: 10
    warmup_steps: 200
    num_cooldown_steps: 2000
    optimizer: "lamb"
    # optimizer: "adamw"
    # scheduler: "delay"
    scheduler: "constant"
    # num_param_groups: 0
    # param_group_size: 2  # Except for the classification head, which has param_group_size == 1
    delay_size: 0

models:
  roberta-base:
    num_attention_heads: 6
    num_hidden_layers: 6
    type_vocab_size: 1
    block_size: 258
    max_tokenized_len: 256
  roberta-lm: {}
  roberta-pred: {}
  roberta-pred-mean-pool:
    hidden_dropout_prob: 0.2
    output_mode: "regression"
    # For sparse (bce + mse) loss
    # output_mode: "sparse"
    threshold: 1
    alpha: 0.1
  dnabert-base:
    block_size: 512
    max_tokenized_len: 510
  dnabert-lm: {}
  dnabert-pred: {}
  dnabert-pred-mean-pool:
    hidden_dropout_prob: 0.2
    output_mode: "regression"
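
# Usage sketch (kept as comments so the YAML stays valid). Assumptions: the
# config is loaded with PyYAML, and the file name "config.yaml" plus the
# plain-dict access below are illustrative, not a documented interface of
# this project.
#
#   import yaml
#
#   with open("config.yaml") as fh:
#       config = yaml.safe_load(fh)
#
#   # Pull the fine-tuning hyperparameters, e.g. when building a Trainer:
#   finetune_args = config["training"]["finetune"]
#   learning_rate = finetune_args["learning_rate"]             # 0.001
#   batch_size = finetune_args["per_device_train_batch_size"]  # 64
#
#   # Per-model settings live under `models`, keyed by variant name:
#   roberta_cfg = config["models"]["roberta-pred-mean-pool"]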