silence09 committed · verified
Commit 6da791b · Parent: f937d71

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,58 @@
---
license: mit
base_model:
- deepseek-ai/DeepSeek-R1
---
# Lightweight DeepSeek-R1 (2-Hidden-Layer Version with Smaller Dimensions)

This project was created using the official **DeepSeek-R1** modeling script (`modeling_deepseek.py`) from [Hugging Face](https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py). It implements a **2-layer version** of DeepSeek-R1 with randomly initialized weights and smaller dimensions.

## Model Structure
The two hidden layers consist of:
- **One hidden layer: MLA + dense MLP**
- **One hidden layer: MLA + MoE (Mixture of Experts) MLP**

This matches `first_k_dense_replace: 1` in the config, which keeps the first layer's MLP dense and routes the second layer through the MoE. The layer types can be verified directly, as sketched below.
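
A minimal inspection sketch (assuming the model has been loaded as in the Usage section below; the `DeepseekV3MLP`/`DeepseekV3MoE` class names are taken from the official `modeling_deepseek.py` and may differ across versions):

```python
# Print the MLP type of each of the two decoder layers.
# Expected: a dense MLP at index 0 and an MoE block at index 1.
for i, layer in enumerate(model.model.layers):
    print(i, type(layer.mlp).__name__)
```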

The configuration overrides that distinguish this model from the original **DeepSeek-R1** are shown below:
```json
{
  "first_k_dense_replace": 1,
  "intermediate_size": 1024,
  "n_routed_experts": 64,
  "num_experts_per_tok": 4,
  "moe_intermediate_size": 128,
  "num_hidden_layers": 2,
  "num_nextn_predict_layers": 0
}
```
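
These values can be confirmed against the shipped `config.json`, for example with a quick check like this (a sketch; `trust_remote_code=True` is needed because the configuration class is pulled from the original DeepSeek-R1 repo via the `auto_map` entry in `config.json`):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(
    "silence09/DeepSeek-R1-Small-2layers", trust_remote_code=True
)
# Spot-check the reduced dimensions listed above
print(cfg.num_hidden_layers)      # 2
print(cfg.first_k_dense_replace)  # 1
print(cfg.moe_intermediate_size)  # 128
print(cfg.n_routed_experts)       # 64
```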

## Purpose
These weights provide a lightweight implementation for researchers who want to study the model architecture and run experiments quickly.

The original **DeepSeek-R1** model requires an **8x H200 GPU setup** and runs on the **vLLM/SGLang** frameworks, making it difficult to deploy on standard hardware.

## Usage

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# trust_remote_code=True is required: the architecture is loaded from
# the original DeepSeek-R1 repo via the auto_map in config.json.
model = AutoModelForCausalLM.from_pretrained(
    'silence09/DeepSeek-R1-Small-2layers',
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).cuda()
tokenizer = AutoTokenizer.from_pretrained('silence09/DeepSeek-R1-Small-2layers')

# Build a single-turn chat prompt
prompt = "Who are you?"
messages = [{"role": "user", "content": prompt}]
prompt_tokens = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Greedy decoding, then strip the prompt tokens from the output
generated_ids = model.generate(prompt_tokens, max_new_tokens=100, do_sample=False)
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(prompt_tokens, generated_ids)
]
completion = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(completion)
messages.append({"role": "assistant", "content": completion})
```
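
Note that, because the weights are randomly initialized, the generated text will not be meaningful; the checkpoint is intended for architecture and pipeline testing only. As a sanity check on model size: the `model.safetensors` LFS pointer in this commit lists 4,858,028,608 bytes, and at 2 bytes per bf16 parameter that corresponds to roughly 2.4B parameters, which can be verified after loading (a small sketch):

```python
# Total parameter count; in bf16 (2 bytes/param) this should be consistent
# with the ~4.86 GB model.safetensors shipped in this repo (~2.4B params).
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e9:.2f}B parameters")
```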

## More Info
The model was created with the Python script available in [this repository](https://github.com/silencelamb/naked_llama/blob/main/hf_example/create_deepseek_r1_small_2layers.py).
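
At a high level, a script like that plausibly follows the standard pattern for building a shrunken random-weight checkpoint (a hedged sketch based on the config diff above, not the actual script contents):

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Start from the official DeepSeek-R1 configuration (custom code from the Hub)
cfg = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1", trust_remote_code=True)

# Apply the overrides listed in the README's config diff
cfg.num_hidden_layers = 2
cfg.first_k_dense_replace = 1
cfg.intermediate_size = 1024
cfg.n_routed_experts = 64
cfg.num_experts_per_tok = 4
cfg.moe_intermediate_size = 128
cfg.num_nextn_predict_layers = 0

# Instantiate with random weights, cast to bf16, and save
model = AutoModelForCausalLM.from_config(cfg, trust_remote_code=True)
model = model.to(torch.bfloat16)
model.save_pretrained("DeepSeek-R1-Small-2layers")
```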
config.json ADDED
@@ -0,0 +1,62 @@
{
  "_name_or_path": "deepseek-ai/DeepSeek-R1",
  "architectures": [
    "DeepseekV3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "deepseek-ai/DeepSeek-R1--configuration_deepseek.DeepseekV3Config",
    "AutoModel": "deepseek-ai/DeepSeek-R1--modeling_deepseek.DeepseekV3Model",
    "AutoModelForCausalLM": "deepseek-ai/DeepSeek-R1--modeling_deepseek.DeepseekV3ForCausalLM"
  },
  "aux_loss_alpha": 0.001,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "ep_size": 1,
  "first_k_dense_replace": 1,
  "hidden_act": "silu",
  "hidden_size": 7168,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "kv_lora_rank": 512,
  "max_position_embeddings": 163840,
  "model_type": "deepseek_v3",
  "moe_intermediate_size": 128,
  "moe_layer_freq": 1,
  "n_group": 8,
  "n_routed_experts": 64,
  "n_shared_experts": 1,
  "norm_topk_prob": true,
  "num_attention_heads": 128,
  "num_experts_per_tok": 4,
  "num_hidden_layers": 2,
  "num_key_value_heads": 128,
  "num_nextn_predict_layers": 0,
  "pretraining_tp": 1,
  "q_lora_rank": 1536,
  "qk_nope_head_dim": 128,
  "qk_rope_head_dim": 64,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn"
  },
  "rope_theta": 10000,
  "routed_scaling_factor": 2.5,
  "scoring_func": "sigmoid",
  "seq_aux": true,
  "tie_word_embeddings": false,
  "topk_group": 4,
  "topk_method": "noaux_tc",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.2",
  "use_cache": true,
  "v_head_dim": 128,
  "vocab_size": 129280
}
generation_config.json ADDED
@@ -0,0 +1,6 @@
{
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "transformers_version": "4.48.2"
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:308c09513f65a9c3808909ddcdfa0e719a0849cf46d0ba2a36208ce581b999fc
size 4858028608
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
{
  "bos_token": {
    "content": "<|begin▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff