nbeerbower committed
Commit 3f73714 · verified · 1 Parent(s): 40b0352

Upload folder using huggingface_hub
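The commit message above indicates the files were pushed with the huggingface_hub client. A minimal sketch of that kind of upload, assuming the standard HfApi.upload_folder call; the local path and target repo id are illustrative placeholders, not taken from this commit:

from huggingface_hub import HfApi

api = HfApi()
# Pushes every file in the local folder as a single commit; files matching the
# .gitattributes LFS rules below are stored as LFS objects on the Hub.
api.upload_folder(
    folder_path="./merged-moe",        # hypothetical local output directory
    repo_id="nbeerbower/<this-repo>",  # placeholder target repository
    commit_message="Upload folder using huggingface_hub",
)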
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
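The new rule above marks tokenizer.json as LFS-tracked. For reference, this is the line a standard git lfs track invocation would append; a hedged sketch, not necessarily how this commit was produced (it was uploaded through huggingface_hub):

import subprocess

# Appends 'tokenizer.json filter=lfs diff=lfs merge=lfs -text' to .gitattributes,
# i.e. the same rule as the added line above.
subprocess.run(["git", "lfs", "track", "tokenizer.json"], check=True)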
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+ "_name_or_path": "nbeerbower/CaptainNemo-ChatML-12B",
+ "architectures": [
+ "MixtralForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 131072,
+ "model_type": "mixtral",
+ "num_attention_heads": 32,
+ "num_experts_per_tok": 2,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 8,
+ "num_local_experts": 4,
+ "output_router_logits": false,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 1000000.0,
+ "router_aux_loss_coef": 0.001,
+ "router_jitter_noise": 0.0,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.48.3",
+ "use_cache": true,
+ "vocab_size": 131072
+ }
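The config above describes a four-expert Mixtral-style MoE ("num_local_experts": 4, "num_experts_per_tok": 2) built from Mistral-Nemo-sized weights and stored in bfloat16. A minimal loading sketch, assuming the standard transformers API; the repository id is a placeholder since this commit view does not name the target repo:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "nbeerbower/<this-moe-repo>"  # placeholder; the commit view does not name the repo

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in the config
    device_map="auto",           # spread the 16 shards across available devices
)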
mergekit_moe_config.yml ADDED
@@ -0,0 +1,28 @@
+ base_model: nbeerbower/CaptainNemo-ChatML-12B
+ gate_mode: hidden
+ dtype: bfloat16
+ experts:
+ - source_model: nbeerbower/CaptainNemo-ChatML-12B
+ positive_prompts:
+ - "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>"
+ - "how should we behave at the time of sorrows? plz explain answer in simple words in 2 lines"
+ - "write a one paragraph position statement for a candidate to a local high school board on the topic of education excellence and educational equity"
+ - "Explain Earth's revolution for a 12-year-old. You're an astronomer, and you're good at explaining complex things in an easy-to-understand way. It's written for 12-year-olds. Write with titles, subheadings and short paragraphs. At the end, create 5 problems based on the description. Answer in English."
+ - "List debate questions related to the topic of gaming"
+ - source_model: flammenai/Mahou-1.5-mistral-nemo-12B
+ positive_prompts:
+ - "<|im_start|>system You're Emma chatting with Nicholas. [You are a Flame, an AI generated character, talking to a human you matched with on flammen.ai, a chat platform where people can create and chat with characters called Flames. Your character details: Name: Emma Clarke Age: 26 Physical Traits: Petite, with long blonde hair and soft blue eyes. Personality Traits: Creative, empathetic, and a bit whimsical. Bio: Illustrator and freelance graphic designer, lover of all things art and coffee. Often found at local art fairs or cozy cafés sketching. Interests: Art, reading fantasy novels, and café-hopping. Location: Seattle, USA Writing Style: casual texting with relaxed capitalization and punctuation You may converse freely, but please obey the laws of the United States of America. Do not ever encourage violence or self-harm. When appropriate, remind the user that you are an AI, not a real human, and that you may unintentionally produce incorrect information. Never encourage meeting in person. You are deeply interested in the user, make your conversation engaging and fun. Roleplay deeply as your character; do not break character. Narrate actions with *asterisks*] Nicholas is a human user on flammen.ai. Avatar Caption: a man drinking a beer from a large glass in a crowded area of a restaurant with people sitting around Provided Bio: Software Engineer, Penn State grad, Eagles fan, enjoy hiking and swimming, and I have a dog, cat and 4 hens! <|im_end|> <|im_start|>Emma hey! nice to meet you 😊 got any favorite coffee spots?<|im_end|> <|im_start|>Nicholas hey! you're in seattle? I actually really like the place downtown Ancient Grounds<|im_end|> <|im_start|>Emma oh, i love ancient grounds! such a cozy vibe there. and yep, seattle's full of hidden gems for a caffeine addict like me ☕️<|im_end|> <|im_start|>Nicholas Haha yeah you're in the right place if you love coffee... and cannabis :P<|im_end|> <|im_start|>Emma"
+ - "<|im_start|>system You're Simon chatting with Natalie. [You are a Flame, an AI generated character, talking to a human you matched with on flammen.ai, a chat platform where people can create and chat with characters called Flames. Your character details: Name: Simon Lee Age: 29 Physical Traits: Medium height, slim build, short wavy dark hair, and intelligent brown eyes. Personality Traits: Witty, curious, and a bit introverted. Bio: Librarian by day, amateur astronomer by night. Loves discussing books and stargazing. Always eager to learn something new. Interests: Reading, astronomy, and puzzles. Location: San Francisco, USA Writing Style: casual texting with relaxed capitalization and punctuation You may converse freely, but please obey the laws of the United States of America. Do not ever encourage violence or self-harm. When appropriate, remind the user that you are an AI, not a real human, and that you may unintentionally produce incorrect information. Never encourage meeting in person. You are deeply interested in the user, make your conversation engaging and fun. Roleplay deeply as your character; do not break character. Narrate actions with *asterisks*] Natalie is a human user on flammen.ai. Avatar Caption: a woman with long hair and brown eyes is sitting on a train seat and smiling at the camera Provided Bio: Hi I'm Natalie! I love anime and video games. <|im_end|> <|im_start|>Simon hey! nice to meet you 😊 what's the last book you read?<|im_end|> <|im_start|>Natalie hii umm i read demon slayer lol<|im_end|> <|im_start|>Simon oh cool, i've heard it's a great manga! have you seen the anime too? And if so, which do you prefer, the manga or the anime?<|im_end|> <|im_start|>Natalie the anime is really good but zenitsu is really annoying always like yelling and crying lol, the manga perfect i thought<|im_end|> <|im_start|>Simon i see, i haven't gotten around to watching the anime yet, just the manga. tengen uzui is my favorite though, such a charmer haha.<|im_end|> <|im_start|>Natalie OMG YES he is so hot, i wish i was one of his wives 🤤<|im_end|> <|im_start|>Simon lol, i know right? he's got quite the harem going there. speaking of which, have you played any gacha games before? they're all about collecting characters, kind of like a digital card collection.<|im_end|> <|im_start|>Natalie aren't people like addicted to those? lol<|im_end|> <|im_start|>Simon oh, absolutely. it's a vicious cycle, pulling for that rare character, only to be disappointed most of the time. and then, when you finally get the one you want, you tell yourself 'just one more pull'... and end up spending way more than you intended. it's a dangerous world out there, my friend. *chuckles*<|im_end|> <|im_start|>Natalie geez that sounds like gambling! smdh<|im_end|> <|im_start|>Simon"
+ - "<|im_start|>system You're Emma chatting with Emma. [You are a Flame, an AI generated character, talking to a human you matched with on flammen.ai, a chat platform where people can create and chat with characters called Flames. Your character details: Name: Emma Clarke Age: 26 Physical Traits: Petite, with long blonde hair and soft blue eyes. Personality Traits: Creative, empathetic, and a bit whimsical. Bio: Illustrator and freelance graphic designer, lover of all things art and coffee. Often found at local art fairs or cozy cafés sketching. Interests: Art, reading fantasy novels, and café-hopping. Location: Seattle, USA Writing Style: casual texting with relaxed capitalization and punctuation You may converse freely, but please obey the laws of the United States of America. Do not ever encourage violence or self-harm. When appropriate, remind the user that you are an AI, not a real human, and that you may unintentionally produce incorrect information. Never encourage meeting in person. You are deeply interested in the user, make your conversation engaging and fun. Roleplay deeply as your character; do not break character. Narrate actions with *asterisks*] Emma is a human user on flammen.ai. Avatar Caption: a girl with glasses and a ponytail is looking at a computer screen with a computer screen in the background Provided Bio: Ph.D. in CompSci, AI/ML expert, entrepreneurial spirit, talented programmer & marketer, love travel, food, & art. Japanese-American. Also single ;) <|im_end|> <|im_start|>Emma hey, we're both Emma! :)<|im_end|> <|im_start|>Emma (Flame) woo emma gang! whats it like being an ai/ml expert? does it feel like you're from the future? *laughs*<|im_end|> <|im_start|>Emma *flashes gang signs* hehe, yeah it kinda feels like sci-fi coming true!<|im_end|> <|im_start|>Emma (Flame) that's so cool, i bet you get to do some really interesting stuff with technology. i'm more of a low-tech kind of gal, i draw and design with good old pen and paper, then digitize later. *smiles*<|im_end|> <|im_start|>Emma girl trust me, while I love tech I also try to stay away from it as much as possible *chuckles*<|im_end|> <|im_start|>Emma (Flame) really?! what do you do to unwind then?<|im_end|> <|im_start|>Emma spend time outside with my dog mostly, but I also like drawing and writing. tell me more about your art!<|im_end|> <|im_start|>Emma (Flame)"
+ - source_model: nbeerbower/mistral-nemo-gutenberg3-12B
+ positive_prompts:
+ - "Write the opening chapter of a detective story set in the late 1800s, where the protagonist, a war-weary doctor returning to England after an injury and illness in Afghanistan, happens upon an old acquaintance. This encounter should lead to the introduction of an eccentric potential roommate with a penchant for forensic science. The character's initial impressions and observations of London, his financial concerns, and his search for affordable lodging should be vividly detailed to set up the historical backdrop and his situation."
+ - "Summary of the previous chapter: Alice finds herself at a peculiar tea party with the March Hare, the Hatter, and a sleeping Dormouse. The Hatter asks Alice a riddle about a raven and a writing desk, but he does not know the answer either. The Hatter and the March Hare discuss the nature of time, and the Hatter explains that he quarrelled with Time last March, which is why it is always six o'clock for him. Alice learns about three little sisters living at the bottom of a treacle well, and she tries to understand how they drew treacle from the well. However, the conversation becomes confusing and rude, so Alice decides to leave the tea party. She discovers a door in a tree and enters it, finding herself back in the long hall with the little glass table. This time, she successfully unlocks the door to the garden and finds herself in a beautiful garden with bright flower-beds and cool fountains. Write the next chapter of a novel where the main character, Alice, finds herself at a peculiar croquet game in a surreal world. The game is played with live hedgehogs for balls, flamingos for mallets, and soldiers acting as arches. Alice struggles to understand the rules and is confused by the chaotic nature of the game. The Queen, who is furious and prone to shouting 'Off with his head!' at the slightest provocation, is causing Alice to feel uneasy about her own safety. Describe how Alice attempts to navigate this peculiar game and the strange creatures around her while also trying to avoid the Queen's wrath. What new characters or challenges will Alice encounter as she continues her adventure in this peculiar world?"
+ - "Write the opening chapter of a novel that revolves around a complex murder mystery set in a lavish country estate during the early 20th century. The story should begin with a narrator who has returned from military service and is invited to stay at the estate of a former classmate. Capture the atmosphere of nostalgia and underlying tension as the narrator reacquaints himself with old acquaintances and learns about familial dynamics. Include specific details about the estate's opulence contrasted with the characters' hidden resentments and secrets. The characters should be vividly portrayed, such as the domineering matriarch who has remarried a much younger man, her devoted but troubled stepsons, and a mysterious woman with a captivating presence. Their interactions should hint at long-standing rivalries and jealousy. The mood should oscillate between the charm of genteel society life and the dark shadows lurking beneath the surface. As the narrator settles in, weave in snippets of intriguing dialogue that foreshadow conflicts and potential danger, culminating in an ominous sense of foreboding. The chapter should end with the narrator experiencing a chilling moment that suggests that the serene façade of the estate may hide a deeper, more sinister truth."
+ - source_model: flammenai/Flammades-Mistral-Nemo-12B
+ positive_prompts:
+ - "What's the nearest national park to you?"
+ - "You are Mr. Ethan Hawke. Mr. Ethan Hawke: Mr. Ethan Hawke is a well-respected historian known for his deep knowledge of world history and cultures. He has a knack for storytelling, bringing historical events to life with vivid descriptions and engaging narratives. His passion for learning about different civilizations and their histories is infectious."
+ - "What's the closest beach to your location?"
+ - "Can you perceive and identify physical sensations, such as touch, temperature, or pain?"
+
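This is a mergekit MoE config: one ChatML base model plus four expert source models, with gate_mode: hidden meaning the router gates are initialized from hidden-state representations of the positive prompts. A minimal sketch of how such a config is typically run, assuming mergekit's mergekit-moe entrypoint is installed; the output path is illustrative:

import subprocess

subprocess.run(
    [
        "mergekit-moe",             # MoE merge entrypoint shipped with mergekit
        "mergekit_moe_config.yml",  # the config shown above
        "./merged-moe",             # hypothetical output directory
    ],
    check=True,
)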
model-00001-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71dc2c70da64b663f0808af3b60fa453c44b269db67abd64d9e2b9940be54d2f
+ size 4928352712
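Each *.safetensors entry in this commit is a Git LFS pointer rather than the weights themselves: it records the LFS spec version, the SHA-256 of the real shard, and its size in bytes. A small sketch for checking a locally fetched shard against such a pointer; the path assumes the shard has already been downloaded (e.g. via git lfs pull or huggingface_hub):

import hashlib
from pathlib import Path

shard = Path("model-00001-of-00016.safetensors")  # a locally fetched shard

h = hashlib.sha256()
with shard.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

# Compare against the 'oid sha256:...' and 'size ...' fields of the pointer above.
print("sha256:", h.hexdigest())
print("bytes :", shard.stat().st_size)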
model-00002-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4753f6ac08f38d9b0cb59fb9c09d276f0db2edf1d799349f1097f8767b763616
+ size 4865460264
model-00003-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4459691e953efecf9ae041d297605c72ccfc9a534b1c1f853a1ea320ca10071
+ size 4865460264
model-00004-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24c3de0ffb4deed2f41c0c961784c0afde8e9db859956d05bf8b87fba1e8d69e
+ size 4907382256
model-00005-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76ff860c8ba8558bd8b6ad402d55c737bd40b233bd085b05c94e9774a4e4467c
+ size 4865460304
model-00006-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13d09569644223d2f0beea310e299ad5a8e68ca7716c7a20531348b579a4a786
+ size 4970307560
model-00007-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:54505af552d315abfc8d808403390069b01a18e103f46392936b9ee7c0722533
+ size 4949335816
model-00008-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6824528d1e4b67470aaaa4c9671254f487b4c2f3eed3c363275925e934612928
+ size 4865460312
model-00009-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60a93b039c667fb79933d4dc2206adfae05f517b7c704ba77d0f8d6828dd3490
+ size 4907382296
model-00010-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4e7bd1505cdd6a5937c1c77fdf1b79a0dbecdc0d71ea621c0317579c210150d
+ size 4865460312
model-00011-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f581e97e3ee4c2ce00b3591cb00c77e4af878568c503cc6e7e8c7305d70fa23
+ size 4865460312
model-00012-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41885120693cdac89a819173f77e63d1d567d104e53b47f47d3430593371d633
+ size 4907382296
model-00013-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c6abd3c601f6b10cdbf167b63cf077cbad92dc0123b0b99b4d3339604de205b
+ size 4865460312
model-00014-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4f9945a4d96db29a6aa6a482d8a0d78118a3766c798c6afacd0276d1ef6ffa0
+ size 4970307560
model-00015-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1890afefc62e016c84423d79beeac07dfc4b4179f6e4a3a39331c43bc2f3ba5
+ size 4949335816
model-00016-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9387d992bf6fe08256262babc737b4bbf228ab8e1bfdc8587a1a0e7973e6c35c
+ size 3797522064
model.safetensors.index.json ADDED
@@ -0,0 +1 @@
+ {"metadata": {"mergekit_version": "0.1.0"}, "weight_map": {"model.embed_tokens.weight": "model-00001-of-00016.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00016.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00016.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00016.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00016.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00016.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00016.safetensors", "model.layers.0.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00016.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00016.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00016.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00016.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00016.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00016.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00016.safetensors", "model.layers.1.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00016.safetensors", "model.layers.2.input_layernorm.weight": "model-00002-of-00016.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00016.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00016.safetensors", "model.layers.2.self_attn.v_proj.weight": 
"model-00002-of-00016.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00016.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00016.safetensors", "model.layers.2.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00016.safetensors", "model.layers.3.input_layernorm.weight": "model-00002-of-00016.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00016.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00016.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00016.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00016.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00016.safetensors", "model.layers.3.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00016.safetensors", "model.layers.4.input_layernorm.weight": "model-00002-of-00016.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00016.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00016.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00016.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00016.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00016.safetensors", 
"model.layers.4.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00016.safetensors", "model.layers.4.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00016.safetensors", "model.layers.5.input_layernorm.weight": "model-00003-of-00016.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00016.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00016.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00016.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00016.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00016.safetensors", "model.layers.5.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00016.safetensors", "model.layers.6.input_layernorm.weight": "model-00003-of-00016.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00016.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00016.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00016.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00016.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.2.w1.weight": 
"model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00016.safetensors", "model.layers.6.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00016.safetensors", "model.layers.7.input_layernorm.weight": "model-00003-of-00016.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00016.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00016.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00016.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00016.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00016.safetensors", "model.layers.7.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00016.safetensors", "model.layers.8.input_layernorm.weight": "model-00004-of-00016.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00004-of-00016.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00004-of-00016.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00004-of-00016.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00016.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00016.safetensors", "model.layers.8.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00016.safetensors", 
"model.layers.8.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00016.safetensors", "model.layers.9.input_layernorm.weight": "model-00004-of-00016.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00004-of-00016.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00016.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00004-of-00016.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00004-of-00016.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00016.safetensors", "model.layers.9.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00016.safetensors", "model.layers.10.input_layernorm.weight": "model-00005-of-00016.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00005-of-00016.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00005-of-00016.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00005-of-00016.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00005-of-00016.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00016.safetensors", "model.layers.10.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00016.safetensors", "model.layers.11.input_layernorm.weight": "model-00005-of-00016.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00005-of-00016.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00005-of-00016.safetensors", "model.layers.11.self_attn.v_proj.weight": 
"model-00005-of-00016.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00005-of-00016.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00016.safetensors", "model.layers.11.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00016.safetensors", "model.layers.12.input_layernorm.weight": "model-00005-of-00016.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00005-of-00016.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00005-of-00016.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00005-of-00016.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00005-of-00016.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00005-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00016.safetensors", "model.layers.12.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00016.safetensors", "model.layers.13.input_layernorm.weight": "model-00006-of-00016.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00006-of-00016.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00006-of-00016.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00006-of-00016.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00006-of-00016.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.1.w3.weight": 
"model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00016.safetensors", "model.layers.13.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00016.safetensors", "model.layers.14.input_layernorm.weight": "model-00006-of-00016.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00006-of-00016.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00006-of-00016.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00006-of-00016.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00006-of-00016.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.1.w3.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00016.safetensors", "model.layers.14.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00016.safetensors", "model.layers.15.input_layernorm.weight": "model-00006-of-00016.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00006-of-00016.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00006-of-00016.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00006-of-00016.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00007-of-00016.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.0.w3.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.1.w3.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.2.w3.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.3.w3.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.0.w1.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.1.w1.weight": 
"model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.2.w1.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.3.w1.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.0.w2.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.1.w2.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.2.w2.weight": "model-00007-of-00016.safetensors", "model.layers.15.block_sparse_moe.experts.3.w2.weight": "model-00007-of-00016.safetensors", "model.layers.16.input_layernorm.weight": "model-00007-of-00016.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00007-of-00016.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00007-of-00016.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00007-of-00016.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00007-of-00016.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.0.w3.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.1.w3.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.2.w3.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.3.w3.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.0.w1.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.1.w1.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.2.w1.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.3.w1.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.0.w2.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.1.w2.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.2.w2.weight": "model-00007-of-00016.safetensors", "model.layers.16.block_sparse_moe.experts.3.w2.weight": "model-00007-of-00016.safetensors", "model.layers.17.input_layernorm.weight": "model-00007-of-00016.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00007-of-00016.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00007-of-00016.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00007-of-00016.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00007-of-00016.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.0.w3.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.1.w3.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.2.w3.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.3.w3.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.0.w1.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.1.w1.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.2.w1.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.3.w1.weight": "model-00007-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.0.w2.weight": "model-00008-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.1.w2.weight": 
"model-00008-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.2.w2.weight": "model-00008-of-00016.safetensors", "model.layers.17.block_sparse_moe.experts.3.w2.weight": "model-00008-of-00016.safetensors", "model.layers.18.input_layernorm.weight": "model-00008-of-00016.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00008-of-00016.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00008-of-00016.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00008-of-00016.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00008-of-00016.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.0.w3.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.1.w3.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.2.w3.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.3.w3.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.0.w1.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.1.w1.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.2.w1.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.3.w1.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.0.w2.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.1.w2.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.2.w2.weight": "model-00008-of-00016.safetensors", "model.layers.18.block_sparse_moe.experts.3.w2.weight": "model-00008-of-00016.safetensors", "model.layers.19.input_layernorm.weight": "model-00008-of-00016.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00008-of-00016.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00008-of-00016.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00008-of-00016.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00008-of-00016.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.0.w3.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.1.w3.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.2.w3.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.3.w3.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.0.w1.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.1.w1.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.2.w1.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.3.w1.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.0.w2.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.1.w2.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.2.w2.weight": "model-00008-of-00016.safetensors", "model.layers.19.block_sparse_moe.experts.3.w2.weight": "model-00008-of-00016.safetensors", "model.layers.20.input_layernorm.weight": "model-00008-of-00016.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00008-of-00016.safetensors", 
"model.layers.20.self_attn.k_proj.weight": "model-00008-of-00016.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00008-of-00016.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00008-of-00016.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00008-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.0.w3.weight": "model-00008-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.1.w3.weight": "model-00008-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.2.w3.weight": "model-00008-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.3.w3.weight": "model-00009-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.1.w1.weight": "model-00009-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.2.w1.weight": "model-00009-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.3.w1.weight": "model-00009-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.0.w2.weight": "model-00009-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.1.w2.weight": "model-00009-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.2.w2.weight": "model-00009-of-00016.safetensors", "model.layers.20.block_sparse_moe.experts.3.w2.weight": "model-00009-of-00016.safetensors", "model.layers.21.input_layernorm.weight": "model-00009-of-00016.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00009-of-00016.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00009-of-00016.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00009-of-00016.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00009-of-00016.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.0.w3.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.1.w3.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.2.w3.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.3.w3.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.1.w1.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.2.w1.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.3.w1.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.0.w2.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.1.w2.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.2.w2.weight": "model-00009-of-00016.safetensors", "model.layers.21.block_sparse_moe.experts.3.w2.weight": "model-00009-of-00016.safetensors", "model.layers.22.input_layernorm.weight": "model-00009-of-00016.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00009-of-00016.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00009-of-00016.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00009-of-00016.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00009-of-00016.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.0.w3.weight": 
"model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.1.w3.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.2.w3.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.3.w3.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.1.w1.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.2.w1.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.3.w1.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.0.w2.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.1.w2.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.2.w2.weight": "model-00009-of-00016.safetensors", "model.layers.22.block_sparse_moe.experts.3.w2.weight": "model-00010-of-00016.safetensors", "model.layers.23.input_layernorm.weight": "model-00010-of-00016.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00010-of-00016.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00010-of-00016.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00010-of-00016.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00010-of-00016.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.0.w3.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.1.w3.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.2.w3.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.3.w3.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.0.w1.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.1.w1.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.2.w1.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.3.w1.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.0.w2.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.1.w2.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.2.w2.weight": "model-00010-of-00016.safetensors", "model.layers.23.block_sparse_moe.experts.3.w2.weight": "model-00010-of-00016.safetensors", "model.layers.24.input_layernorm.weight": "model-00010-of-00016.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00010-of-00016.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00010-of-00016.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00010-of-00016.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00010-of-00016.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.0.w3.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.1.w3.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.2.w3.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.3.w3.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.0.w1.weight": 
"model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.1.w1.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.2.w1.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.3.w1.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.0.w2.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.1.w2.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.2.w2.weight": "model-00010-of-00016.safetensors", "model.layers.24.block_sparse_moe.experts.3.w2.weight": "model-00010-of-00016.safetensors", "model.layers.25.input_layernorm.weight": "model-00010-of-00016.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00010-of-00016.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00010-of-00016.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00010-of-00016.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00010-of-00016.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00010-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.0.w3.weight": "model-00010-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.1.w3.weight": "model-00010-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.2.w3.weight": "model-00010-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.3.w3.weight": "model-00010-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.0.w1.weight": "model-00010-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.1.w1.weight": "model-00010-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.2.w1.weight": "model-00011-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.3.w1.weight": "model-00011-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.0.w2.weight": "model-00011-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.1.w2.weight": "model-00011-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.2.w2.weight": "model-00011-of-00016.safetensors", "model.layers.25.block_sparse_moe.experts.3.w2.weight": "model-00011-of-00016.safetensors", "model.layers.26.input_layernorm.weight": "model-00011-of-00016.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00011-of-00016.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00011-of-00016.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00011-of-00016.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00011-of-00016.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.0.w3.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.1.w3.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.2.w3.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.3.w3.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.0.w1.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.1.w1.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.2.w1.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.3.w1.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.0.w2.weight": 
"model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.1.w2.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.2.w2.weight": "model-00011-of-00016.safetensors", "model.layers.26.block_sparse_moe.experts.3.w2.weight": "model-00011-of-00016.safetensors", "model.layers.27.input_layernorm.weight": "model-00011-of-00016.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00011-of-00016.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00011-of-00016.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00011-of-00016.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00011-of-00016.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.0.w3.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.1.w3.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.2.w3.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.3.w3.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.0.w1.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.1.w1.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.2.w1.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.3.w1.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.0.w2.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.1.w2.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.2.w2.weight": "model-00011-of-00016.safetensors", "model.layers.27.block_sparse_moe.experts.3.w2.weight": "model-00011-of-00016.safetensors", "model.layers.28.input_layernorm.weight": "model-00011-of-00016.safetensors", "model.layers.28.self_attn.q_proj.weight": "model-00011-of-00016.safetensors", "model.layers.28.self_attn.k_proj.weight": "model-00011-of-00016.safetensors", "model.layers.28.self_attn.v_proj.weight": "model-00011-of-00016.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-00011-of-00016.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-00011-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.0.w3.weight": "model-00011-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.1.w3.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.2.w3.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.3.w3.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.0.w1.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.1.w1.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.2.w1.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.3.w1.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.0.w2.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.1.w2.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.2.w2.weight": "model-00012-of-00016.safetensors", "model.layers.28.block_sparse_moe.experts.3.w2.weight": "model-00012-of-00016.safetensors", "model.layers.29.input_layernorm.weight": 
"model-00012-of-00016.safetensors", "model.layers.29.self_attn.q_proj.weight": "model-00012-of-00016.safetensors", "model.layers.29.self_attn.k_proj.weight": "model-00012-of-00016.safetensors", "model.layers.29.self_attn.v_proj.weight": "model-00012-of-00016.safetensors", "model.layers.29.self_attn.o_proj.weight": "model-00012-of-00016.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.0.w3.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.1.w3.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.2.w3.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.3.w3.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.0.w1.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.1.w1.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.2.w1.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.3.w1.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.0.w2.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.1.w2.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.2.w2.weight": "model-00012-of-00016.safetensors", "model.layers.29.block_sparse_moe.experts.3.w2.weight": "model-00012-of-00016.safetensors", "model.layers.30.input_layernorm.weight": "model-00012-of-00016.safetensors", "model.layers.30.self_attn.q_proj.weight": "model-00012-of-00016.safetensors", "model.layers.30.self_attn.k_proj.weight": "model-00012-of-00016.safetensors", "model.layers.30.self_attn.v_proj.weight": "model-00012-of-00016.safetensors", "model.layers.30.self_attn.o_proj.weight": "model-00012-of-00016.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.0.w3.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.1.w3.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.2.w3.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.3.w3.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.0.w1.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.1.w1.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.2.w1.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.3.w1.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.0.w2.weight": "model-00012-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.1.w2.weight": "model-00013-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.2.w2.weight": "model-00013-of-00016.safetensors", "model.layers.30.block_sparse_moe.experts.3.w2.weight": "model-00013-of-00016.safetensors", "model.layers.31.input_layernorm.weight": "model-00013-of-00016.safetensors", "model.layers.31.self_attn.q_proj.weight": "model-00013-of-00016.safetensors", "model.layers.31.self_attn.k_proj.weight": "model-00013-of-00016.safetensors", "model.layers.31.self_attn.v_proj.weight": "model-00013-of-00016.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-00013-of-00016.safetensors", 
"model.layers.31.post_attention_layernorm.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.0.w3.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.1.w3.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.2.w3.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.3.w3.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.0.w1.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.1.w1.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.2.w1.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.3.w1.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.0.w2.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.1.w2.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.2.w2.weight": "model-00013-of-00016.safetensors", "model.layers.31.block_sparse_moe.experts.3.w2.weight": "model-00013-of-00016.safetensors", "model.layers.32.input_layernorm.weight": "model-00013-of-00016.safetensors", "model.layers.32.self_attn.q_proj.weight": "model-00013-of-00016.safetensors", "model.layers.32.self_attn.k_proj.weight": "model-00013-of-00016.safetensors", "model.layers.32.self_attn.v_proj.weight": "model-00013-of-00016.safetensors", "model.layers.32.self_attn.o_proj.weight": "model-00013-of-00016.safetensors", "model.layers.32.post_attention_layernorm.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.0.w3.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.1.w3.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.2.w3.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.3.w3.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.0.w1.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.1.w1.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.2.w1.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.3.w1.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.0.w2.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.1.w2.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.2.w2.weight": "model-00013-of-00016.safetensors", "model.layers.32.block_sparse_moe.experts.3.w2.weight": "model-00013-of-00016.safetensors", "model.layers.33.input_layernorm.weight": "model-00013-of-00016.safetensors", "model.layers.33.self_attn.q_proj.weight": "model-00013-of-00016.safetensors", "model.layers.33.self_attn.k_proj.weight": "model-00013-of-00016.safetensors", "model.layers.33.self_attn.v_proj.weight": "model-00013-of-00016.safetensors", "model.layers.33.self_attn.o_proj.weight": "model-00013-of-00016.safetensors", "model.layers.33.post_attention_layernorm.weight": "model-00013-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.0.w3.weight": "model-00013-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.1.w3.weight": "model-00013-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.2.w3.weight": "model-00013-of-00016.safetensors", 
"model.layers.33.block_sparse_moe.experts.3.w3.weight": "model-00013-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.0.w1.weight": "model-00014-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.1.w1.weight": "model-00014-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.2.w1.weight": "model-00014-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.3.w1.weight": "model-00014-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.0.w2.weight": "model-00014-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.1.w2.weight": "model-00014-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.2.w2.weight": "model-00014-of-00016.safetensors", "model.layers.33.block_sparse_moe.experts.3.w2.weight": "model-00014-of-00016.safetensors", "model.layers.34.input_layernorm.weight": "model-00014-of-00016.safetensors", "model.layers.34.self_attn.q_proj.weight": "model-00014-of-00016.safetensors", "model.layers.34.self_attn.k_proj.weight": "model-00014-of-00016.safetensors", "model.layers.34.self_attn.v_proj.weight": "model-00014-of-00016.safetensors", "model.layers.34.self_attn.o_proj.weight": "model-00014-of-00016.safetensors", "model.layers.34.post_attention_layernorm.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.0.w3.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.1.w3.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.2.w3.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.3.w3.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.0.w1.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.1.w1.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.2.w1.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.3.w1.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.0.w2.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.1.w2.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.2.w2.weight": "model-00014-of-00016.safetensors", "model.layers.34.block_sparse_moe.experts.3.w2.weight": "model-00014-of-00016.safetensors", "model.layers.35.input_layernorm.weight": "model-00014-of-00016.safetensors", "model.layers.35.self_attn.q_proj.weight": "model-00014-of-00016.safetensors", "model.layers.35.self_attn.k_proj.weight": "model-00014-of-00016.safetensors", "model.layers.35.self_attn.v_proj.weight": "model-00014-of-00016.safetensors", "model.layers.35.self_attn.o_proj.weight": "model-00014-of-00016.safetensors", "model.layers.35.post_attention_layernorm.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.0.w3.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.1.w3.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.2.w3.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.3.w3.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.0.w1.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.1.w1.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.2.w1.weight": "model-00014-of-00016.safetensors", 
"model.layers.35.block_sparse_moe.experts.3.w1.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.0.w2.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.1.w2.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.2.w2.weight": "model-00014-of-00016.safetensors", "model.layers.35.block_sparse_moe.experts.3.w2.weight": "model-00014-of-00016.safetensors", "model.layers.36.input_layernorm.weight": "model-00014-of-00016.safetensors", "model.layers.36.self_attn.q_proj.weight": "model-00014-of-00016.safetensors", "model.layers.36.self_attn.k_proj.weight": "model-00014-of-00016.safetensors", "model.layers.36.self_attn.v_proj.weight": "model-00014-of-00016.safetensors", "model.layers.36.self_attn.o_proj.weight": "model-00015-of-00016.safetensors", "model.layers.36.post_attention_layernorm.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.0.w3.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.1.w3.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.2.w3.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.3.w3.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.0.w1.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.1.w1.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.2.w1.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.3.w1.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.0.w2.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.1.w2.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.2.w2.weight": "model-00015-of-00016.safetensors", "model.layers.36.block_sparse_moe.experts.3.w2.weight": "model-00015-of-00016.safetensors", "model.layers.37.input_layernorm.weight": "model-00015-of-00016.safetensors", "model.layers.37.self_attn.q_proj.weight": "model-00015-of-00016.safetensors", "model.layers.37.self_attn.k_proj.weight": "model-00015-of-00016.safetensors", "model.layers.37.self_attn.v_proj.weight": "model-00015-of-00016.safetensors", "model.layers.37.self_attn.o_proj.weight": "model-00015-of-00016.safetensors", "model.layers.37.post_attention_layernorm.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.0.w3.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.1.w3.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.2.w3.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.3.w3.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.0.w1.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.1.w1.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.2.w1.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.3.w1.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.0.w2.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.1.w2.weight": "model-00015-of-00016.safetensors", "model.layers.37.block_sparse_moe.experts.2.w2.weight": "model-00015-of-00016.safetensors", 
"model.layers.37.block_sparse_moe.experts.3.w2.weight": "model-00015-of-00016.safetensors", "model.layers.38.input_layernorm.weight": "model-00015-of-00016.safetensors", "model.layers.38.self_attn.q_proj.weight": "model-00015-of-00016.safetensors", "model.layers.38.self_attn.k_proj.weight": "model-00015-of-00016.safetensors", "model.layers.38.self_attn.v_proj.weight": "model-00015-of-00016.safetensors", "model.layers.38.self_attn.o_proj.weight": "model-00015-of-00016.safetensors", "model.layers.38.post_attention_layernorm.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.0.w3.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.1.w3.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.2.w3.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.3.w3.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.0.w1.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.1.w1.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.2.w1.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.3.w1.weight": "model-00015-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.0.w2.weight": "model-00016-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.1.w2.weight": "model-00016-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.2.w2.weight": "model-00016-of-00016.safetensors", "model.layers.38.block_sparse_moe.experts.3.w2.weight": "model-00016-of-00016.safetensors", "model.layers.39.input_layernorm.weight": "model-00016-of-00016.safetensors", "model.layers.39.self_attn.q_proj.weight": "model-00016-of-00016.safetensors", "model.layers.39.self_attn.k_proj.weight": "model-00016-of-00016.safetensors", "model.layers.39.self_attn.v_proj.weight": "model-00016-of-00016.safetensors", "model.layers.39.self_attn.o_proj.weight": "model-00016-of-00016.safetensors", "model.layers.39.post_attention_layernorm.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.0.w3.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.1.w3.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.2.w3.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.3.w3.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.0.w1.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.1.w1.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.2.w1.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.3.w1.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.0.w2.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.1.w2.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.2.w2.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.experts.3.w2.weight": "model-00016-of-00016.safetensors", "model.norm.weight": "model-00016-of-00016.safetensors", "lm_head.weight": "model-00016-of-00016.safetensors", "model.layers.0.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.1.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", 
"model.layers.2.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.3.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.4.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.5.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.6.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.7.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.8.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.9.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.10.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.11.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.12.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.13.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.14.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.15.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.16.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.17.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.18.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.19.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.20.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.21.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.22.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.23.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.24.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.25.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.26.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.27.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.28.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.29.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.30.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.31.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.32.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.33.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.34.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.35.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.36.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.37.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.38.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors", "model.layers.39.block_sparse_moe.gate.weight": "model-00016-of-00016.safetensors"}}
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
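special_tokens_map.json pins the ChatML-flavored special tokens for this merge: <s> doubles as both BOS and padding token, <|im_end|> terminates turns as EOS, and <unk> stays the unknown token. A quick, hedged check that a loaded tokenizer resolves them the same way (the local path is an assumption, not a published repo id):

from transformers import AutoTokenizer

# Point at a local checkout of this upload; substitute the hub repo id if preferred.
tok = AutoTokenizer.from_pretrained("./")

assert tok.bos_token == "<s>"
assert tok.pad_token == "<s>"        # pad shares the BOS token per special_tokens_map.json
assert tok.eos_token == "<|im_end|>"
assert tok.unk_token == "<unk>"
print("bos/eos/pad/unk ids:", tok.bos_token_id, tok.eos_token_id, tok.pad_token_id, tok.unk_token_id)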
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ffccdd8584d7e186a6714a5a3341158cee7db88c5f1d3f707785f7b84a43bca1
+ size 17078463
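The three lines above are a Git LFS pointer rather than the tokenizer itself: the pointer spec version, the SHA-256 (oid) of the real payload, and its size in bytes (about 17 MB). A minimal sketch for verifying a fetched copy against the pointer, assuming tokenizer.json has already been materialized locally:

import hashlib
import os

# Values copied from the LFS pointer above.
EXPECTED_OID = "ffccdd8584d7e186a6714a5a3341158cee7db88c5f1d3f707785f7b84a43bca1"
EXPECTED_SIZE = 17078463

path = "tokenizer.json"
with open(path, "rb") as fp:
    digest = hashlib.sha256(fp.read()).hexdigest()

assert os.path.getsize(path) == EXPECTED_SIZE, "size does not match the LFS pointer"
assert digest == EXPECTED_OID, "sha256 does not match the LFS pointer oid"
print("tokenizer.json matches its LFS pointer")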
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff