Upload JambaForCausalLM

Browse files

Files changed (11) hide show

README.md +2 -2
config.json +66 -0
generation_config.json +7 -0
model-00001-of-00007.safetensors +3 -0
model-00002-of-00007.safetensors +3 -0
model-00003-of-00007.safetensors +3 -0
model-00004-of-00007.safetensors +3 -0
model-00005-of-00007.safetensors +3 -0
model-00006-of-00007.safetensors +3 -0
model-00007-of-00007.safetensors +3 -0
model.safetensors.index.json +0 -0

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
 license: mit
 datasets:
 - teknium/OpenHermes-2.5
 pipeline_tag: text-generation
-tags:
-- jamba
 ---
 # PLACEHOLDER - Currently training. This is highly experimental and should be viewed as purely a test for now. Jamba has been very hard to train, but I wanted to see how it did on one of the best datasets we have access to. I believe in transparent development, so all *best* working iterations, even if they are a bit wonky, will be pushed here.

 ---
 license: mit
+tags:
+- jamba
 datasets:
 - teknium/OpenHermes-2.5
 pipeline_tag: text-generation
 ---
 # PLACEHOLDER - Currently training. This is highly experimental and should be viewed as purely a test for now. Jamba has been very hard to train, but I wanted to see how it did on one of the best datasets we have access to. I believe in transparent development, so all *best* working iterations, even if they are a bit wonky, will be pushed here.

config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "_name_or_path": "ai21labs/Jamba-v0.1",
+  "architectures": [
+    "JambaForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attn_layer_offset": 4,
+  "attn_layer_period": 8,
+  "auto_map": {
+    "AutoConfig": "ai21labs/Jamba-v0.1--configuration_jamba.JambaConfig",
+    "AutoModel": "ai21labs/Jamba-v0.1--modeling_jamba.JambaModel",
+    "AutoModelForCausalLM": "ai21labs/Jamba-v0.1--modeling_jamba.JambaForCausalLM",
+    "AutoModelForSequenceClassification": "ai21labs/Jamba-v0.1--modeling_jamba.JambaForSequenceClassification"
+  },
+  "bos_token_id": 1,
+  "calc_logits_for_entire_prompt": false,
+  "eos_token_id": 2,
+  "expert_layer_offset": 1,
+  "expert_layer_period": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "mamba_conv_bias": true,
+  "mamba_d_conv": 4,
+  "mamba_d_state": 16,
+  "mamba_dt_rank": 256,
+  "mamba_expand": 2,
+  "mamba_inner_layernorms": true,
+  "mamba_proj_bias": false,
+  "model_type": "jamba",
+  "n_ctx": 262144,
+  "num_attention_heads": 32,
+  "num_experts": 16,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "output_router_logits": false,
+  "pad_token_id": 0,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": [
+      "mamba"
+    ],
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-06,
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.0.dev0",
+  "use_cache": true,
+  "use_mamba_kernels": true,
+  "vocab_size": 65536
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.40.0.dev0"
+}

model-00001-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc887ecb74cdcab4dd760b13741e84c495be39eb311c9ae4ca181f0d3a7b5596
+size 4873003499

model-00002-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01a96b353c80cb781570d786a1ecd0c78ab6a530b77471040b6c2a53aca594aa
+size 4984623421

model-00003-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c17daf2393e7fc4e1ea4ec9b357ed33d0a70b7fc24b5376a226bbd880e46147d
+size 4983413929

model-00004-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9492aa2ce2043f83e4c18075552a17b31cf33b533387bb52e15e5342660c10ea
+size 4997975699

model-00005-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:867fe81979ee53424e8e9b901830c206964fae0705b8d1f2a4bc332534e2b17b
+size 4983414017

model-00006-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b11b8dc7e8642250f55cb44c427264d84eeae1c8b9d6f33426fbcbb0e267fb4
+size 4975844938

model-00007-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf06e9efbea03f7189f806262b8174b0c27372de7b7ce912b5b2db9ba5682bf4
+size 3842901948

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff