CyberSolve-LinAlg-1.1 / config.json
We introduce CyberSolve, a flan-t5-large model fine-tuned on all 2M records of the DeepMind LinAlg 1D dataset. This is the first model checkpoint, scoring 86.56 on the evaluation dataset.
{
"_name_or_path": "/Volumes/workspace_dogfood/jgr/hugging_face_cache/CyberSolve-DeepMind-LinAlg-1D",
"architectures": [
"T5ForConditionalGeneration"
],
"classifier_dropout": 0.0,
"d_ff": 2816,
"d_kv": 64,
"d_model": 1024,
"decoder_start_token_id": 0,
"dense_act_fn": "gelu_new",
"dropout_rate": 0.1,
"eos_token_id": 1,
"feed_forward_proj": "gated-gelu",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": true,
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"n_positions": 512,
"num_decoder_layers": 24,
"num_heads": 16,
"num_layers": 24,
"output_past": true,
"pad_token_id": 0,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": false,
"torch_dtype": "float32",
"transformers_version": "4.47.1",
"use_cache": true,
"vocab_size": 32128
}
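
The config declares a T5ForConditionalGeneration architecture, so the checkpoint can be loaded with the standard transformers API. Below is a minimal sketch; the repository id "MarioBarbeque/CyberSolve-LinAlg-1.1" and the example prompt format are assumptions based on this page's title and the DeepMind mathematics dataset style, not something the config itself specifies.

from transformers import AutoTokenizer, T5ForConditionalGeneration

# Assumed repository id, inferred from the page title; adjust if the model lives elsewhere.
model_id = "MarioBarbeque/CyberSolve-LinAlg-1.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Matches the "architectures" field above.
model = T5ForConditionalGeneration.from_pretrained(model_id)

# Example 1D linear-algebra question in the style of the DeepMind mathematics dataset
# (the exact prompt format used during fine-tuning is an assumption).
prompt = "Solve 24 = 1601*c - 1605*c for c."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))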