jingyaogong committed on
Commit
6d52f6f
·
verified ·
1 Parent(s): 92c1291

Upload 10 files

Browse files
LMConfig.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+ from typing import List
3
+
4
+
5
+ class LMConfig(PretrainedConfig):  # configuration for the "minimind" model type; each constructor argument is stored as an attribute of the same name
6
+ model_type = "minimind"
7
+
8
+ def __init__(
9
+ self,
10
+ dim: int = 512,
11
+ n_layers: int = 8,
12
+ n_heads: int = 16,
13
+ n_kv_heads: int = 8,
14
+ vocab_size: int = 6400,
15
+ hidden_dim: int = None,  # defaults to None despite the int annotation
16
+ multiple_of: int = 64,
17
+ norm_eps: float = 1e-5,
18
+ max_seq_len: int = 512,
19
+ dropout: float = 0.0,
20
+ flash_attn: bool = True,
21
+ ####################################################
22
+ # Here are the specific configurations of MOE
23
+ # When use_moe is false, the following is invalid
24
+ ####################################################
25
+ use_moe: bool = True,
26
+ num_experts_per_tok=2,
27
+ n_routed_experts=4,
28
+ n_shared_experts: bool = True,  # NOTE(review): annotated bool although the name reads like a count — confirm intended type
29
+ scoring_func='softmax',
30
+ aux_loss_alpha=0.01,
31
+ seq_aux=True,
32
+ norm_topk_prob=True,
33
+ **kwargs,
34
+ ):
35
+ self.dim = dim
36
+ self.n_layers = n_layers
37
+ self.n_heads = n_heads
38
+ self.n_kv_heads = n_kv_heads
39
+ self.vocab_size = vocab_size
40
+ self.hidden_dim = hidden_dim
41
+ self.multiple_of = multiple_of
42
+ self.norm_eps = norm_eps
43
+ self.max_seq_len = max_seq_len
44
+ self.dropout = dropout
45
+ self.flash_attn = flash_attn
46
+ ####################################################
47
+ # Here are the specific configurations of MOE
48
+ # When use_moe is false, the following is invalid
49
+ ####################################################
50
+ self.use_moe = use_moe
51
+ self.num_experts_per_tok = num_experts_per_tok # number of experts selected for each token
52
+ self.n_routed_experts = n_routed_experts # total number of experts
53
+ self.n_shared_experts = n_shared_experts # shared experts
54
+ self.scoring_func = scoring_func # scoring function, defaults to 'softmax'
55
+ self.aux_loss_alpha = aux_loss_alpha # alpha parameter of the auxiliary loss
56
+ self.seq_aux = seq_aux # whether to compute the auxiliary loss at the sequence level
57
+ self.norm_topk_prob = norm_topk_prob # whether to normalize the top-k probabilities
58
+ super().__init__(**kwargs)  # remaining keyword arguments are forwarded to PretrainedConfig.__init__
README.md CHANGED
@@ -57,7 +57,7 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055
57
 
58
  | 模型 (大小) | tokenizer长度 | 推理占用 | release | 主观评分(/100) |
59
  |-------------------------|-------------|--------|------------|------------|
60
- | minimind-v1-small (26M) | 6400 | 0.5 GB | 2024.08.28 | 50' |
61
  | minimind-v1-moe (4×26M) | 6400 | 1.0 GB | 2024.09.17 | 55' |
62
  | minimind-v1 (108M) | 6400 | 1.0 GB | 2024.09.01 | 60' |
63
 
@@ -320,12 +320,11 @@ MiniMind的整体结构一致,只是在RoPE计算、推理函数和FFN层的
320
  修改模型配置见[./model/LMConfig.py](./model/LMConfig.py)。
321
  minimind目前训练的模型版本见下表:
322
 
323
- | Model Name | params | len_vocab | n_layers | d_model | kv_heads | q_heads | share+route | TopK |
324
- |------------------|--------|-----------|----------|---------|----------|---------|-------------|------|
325
  | minimind-v1-small | 26M | 6400 | 8 | 512 | 8 | 16 | - | - |
326
- | minimind-v1-moe | 4×26M | 6400 | 8 | 512 | 8 | 16 | 2+4 | 2 |
327
- | minimind-v1 | 108M | 6400 | 16 | 768 | 8 | 16 | - | - |
328
-
329
 
330
  # 📌 Experiment
331
 
@@ -336,11 +335,11 @@ CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
336
  环境:python 3.9 + Torch 2.1.2 + DDP多卡训练
337
  ```
338
 
339
- | Model Name | params | len_vocab | batch_size | pretrain_time | sft_single_time | sft_multi_time |
340
- |------------------|--------|-----------|------------|-------------------|-------------------|---------------------|
341
  | minimind-v1-small | 26M | 6400 | 64 | ≈2 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
342
- | minimind-v1-moe | 4×26M | 6400 | 40 | ≈6 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch) |
343
- | minimind-v1 | 108M | 6400 | 16 | ≈6 hour (1 epoch) | ≈4 hour (1 epoch) | ≈1 hour (1 epoch) |
344
 
345
  ---
346
 
@@ -382,6 +381,7 @@ CPU: Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
382
  ```bash
383
  python 5-dpo_train.py
384
  ```
 
385
  ---
386
 
387
  📋关于LLM的参数配置,有一篇很有意思的论文[MobileLLM](https://arxiv.org/pdf/2402.14905)做了详细的研究和实验。
@@ -410,16 +410,18 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
410
  ![gpt3_config.png](./images/gpt3_config.png)
411
 
412
  ---
 
413
  ### 训练完成的模型权重
414
 
415
- | Model Name | params | Config | pretrain_model | single_sft_model | multi_sft_model |
416
- |-------------------|--------|-----------------------------|----------------|----------------------------------------------------------------|----------------------------------------------------------------|
417
- | minimind-v1-small | 26M | d_model=512<br/>n_layers=8 | - | [链接](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666) | [链接](https://pan.baidu.com/s/1GsGsWSL0Dckl0YPRXiBIFQ?pwd=6666) |
418
- | minimind-v1-moe | 4×26M | d_model=512<br/>n_layers=8 | - | - | - |
419
- | minimind-v1 | 108M | d_model=768<br/>n_layers=16 | - | [链接](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666) | [链接](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
420
 
421
- ---
 
 
 
 
422
 
 
423
 
424
  # 📌 Eval
425
 
@@ -427,7 +429,7 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
427
  > 以下测试于2024-09-17完成,此日期后发布的新模型,无特殊需要时将不加入测试。
428
 
429
  [A] [minimind-v1-small(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
430
- [B] minimind-v1-moe(0.1B)<br/>
431
  [C] [minimind-v1(0.1B)](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)<br/>
432
  [D] [baby-llama2-chinese(0.2B)](https://github.com/DLLXW/baby-llama2-chinese)<br/>
433
  [E] [chatlm-mini-chinese(0.2B)](https://github.com/charent/ChatLM-mini-Chinese)<br/>
@@ -435,7 +437,7 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
435
  ```text
436
  [Q]: 你知道长江吗?
437
  [A]: 是的,长江是中国最长的河流,全长超过3300千米。
438
- [B]: 是的,长江是中国最长的河流,全长约7100公里。它的流域面积约为3328万平方公里,是中国最大的河流之一,也是世界第三大河流,总长度约为6300千米。
439
  [C]: 是的,长江是中国最长的河流之一,也是世界第三大河流。长江发源于青藏高原,流经11个省份,最终注入东海。长江流域是中国最大的河流流域之一,拥有着丰富的自然资源和文化遗产,如长江大桥、长江海洋博物馆、长江古城等。
440
  [D]: 同时也知道,中国是世界上最长的城市之一,拥有超过3400万人口。
441
  [E]: 是的,长江是中国最长的河流,也是世界上最长的河流之一。它发源于青藏高原,流经九个省区,最终注入东海。长江流域的气候温和,四季分明,是中国重要的农业和工业基地。长江流域的水产资源丰富,是中国的重要渔业资源之一。
@@ -443,7 +445,7 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
443
 
444
  [Q]: 第一颗人造卫星是哪个国家发射的?
445
  [A]: 第一颗人造卫星是斯普特尼克1号。
446
- [B]: 第一颗人造卫星是美国发射的。
447
  [C]: 第一颗人造卫星是苏联发射的,它是由苏联于1957年10月发射的。
448
  [D]: 这颗名为“阿姆斯特朗”的卫星,于1961年11月20日发射,是第一个成功将人类送入太空的卫星。
449
  [E]: 第一颗人造卫星是苏联于1957年4月4日发射的,它是由苏联宇航员尤里·加加林(Yuri Gagarin)和苏联工程师罗尔德·谢尔盖耶夫(Roald Skohley)共同发射的。在发射过程中,加加林和谢尔盖耶夫使用了一个名为“斯普特尼克1号”的小型卫星,它的直径为4.3米,重量为1,051公斤。这枚卫星被称为“斯普特尼克1号”,是苏联太空竞赛中的一个里程碑,也是人类历史上第一次在太空中进行太空行走。
@@ -451,15 +453,16 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
451
 
452
  [Q]: 你知道泰山在哪里吗?
453
  [A]: 泰山位于中国东部地区,是中国五岳之一。
454
- [B]: 泰山位于山东省泰安市,是中国五岳之一。
455
  [C]: 泰山位于中国山东省泰安市,是中国五岳之一。
456
  [D]: 作为一个人工智能助手,我无法访问互联网,因此无法提供关于你所在位置的信息。不过,你可以通过搜索或搜索相关关键词来获取有关你所在位置的信息,或者查看相关新闻或网站来获取有关你所在位置的信息。
457
  [E]: 泰山位于中国山东省泰安市境内,是中国五岳之一,也是中国文化和历史的重要象征之一。它是中国五岳之一,也是中国道教和佛教的发源地之一。泰山的海拔高度为1,545米,是中国五岳之一,也是中国最著名的旅游景点之一。它的历史可以追溯到公元前5世纪,是中国古代帝王祭祀的场所。泰山也是中国文化的重要组成部分,被誉为“五岳之首”,是中国文化的重要遗产之一。
 
458
 
459
 
460
  [Q]: 世界上最高的山峰是什么?
461
  [A]: 世界上最高的山峰是珠穆朗玛峰,位于尼泊尔和中国的边境上,海拔8848米。
462
- [B]: 珠穆朗玛峰是世界上最高的山峰,位于喜马拉雅山脉,海拔8848米。
463
  [C]: 珠穆朗玛峰是世界上最高的山峰,海拔8848米。
464
  [D]: 世界上最高的山峰是珠穆朗玛峰,位于喜马拉雅山脉,海拔8,848.86米(29,031.69英尺)。
465
  [E]: 世界上最高的山峰是珠穆朗玛峰,也称为萨加玛塔,位于喜马拉雅山脉的尼泊尔和中国边境之间,海拔8,848.86米(29,031.69英尺)。它是世界上最著名的登山目标之一,吸引着来自世界各地的登山者前来挑战。珠穆朗玛峰的名字来自藏语,意思是“女神的额峰”,它是由两个不同的山峰组成的,其中一个是位于尼泊尔的珠穆朗玛峰,另一个则位于中国。它的高度是通过测量海拔高度来确定的,因为它是通过测量地球的平均高度来确定的。尽管珠穆朗玛峰是世界上最高的山峰,但它并不是地球上任何其他地方都能到达的。地球上的其他地方,如南美洲的安第斯山脉,也是世界上最高的山脉,但它并不是地球上最高的山峰。总之,珠穆朗玛峰是世界上最高的山峰,但它并不是地球上任何地方都能到达的。它是地球上最高的山脉,但它并不是地球上任何地方都能到达的。
@@ -467,7 +470,7 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
467
 
468
  [Q]: 万有引力是谁提出的?
469
  [A]: 万有引力是由英国物理学家牛顿提出的。他认为,所有物体都会产生引力,这个引力是由物体的质量和距离决定的。这个理论揭示了质量和距离之间的关系,并证明了万有引力的本质。
470
- [B]: 万有引力是由牛顿发现的,它是由英国物理学家詹姆斯·瓦特于1879年提出的。
471
  [C]: 万有引力是英国物理学家艾萨克·牛顿在17世纪提出的。
472
  [D]: 引力是宇宙中一种力量,它是由质量或能量所组成的。根据牛顿的万有引力定律,一个物体之间的引力大小与它的质量成正比,与它的质量成反比。因此,牛顿的万有引力定律表明,一个物体之间的引力大小与它的质量成正比,与它的质量成反比。这个定律在物理学、天文学和物理学中都有广泛的应用。
473
  [E]: 万有引力是由英国物理学家艾萨克·牛顿于1687年提出的。他在1687年发表了一篇著名的论文,题为《自然哲学的数学原理》,其中提出了万有引力定律,指出任何两个物体之间都存在着一种引力,这种引力的大小与它们的质量成正比,与它们之间的距离的平方成反比。这个定律被称为万有引力定律,它描述了质量和距离之间的关系。它表明,两个物体之间的引力与它们的质量成正比,与它们之间的距离的平方成反比。这个定律对于解释宇宙中的许多现象和现象之间的相互作用非常重要,例如行星绕太阳运动、恒星之间的引力透镜效应等等。它也被广泛应用于天文学、物理学和工程学等领域,为我们理解宇宙的本质和演化提供了重要的理论基础。
@@ -475,7 +478,7 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
475
 
476
  [Q]: 你知道大熊猫的主要食物是什么吗?
477
  [A]: 大熊猫的主要食物是竹子。它们通常吃竹子,每天需要吃掉大约30公斤的竹子。
478
- [B]: 大熊猫主要是竹子,它们的主要食物是竹子。大熊猫的消化系统比较特殊,它们每天需要消化大量的竹子才能维持其生存需要。除了竹子外,大熊猫还在野外生存。
479
  [C]: 大熊猫主要以竹子为食,也会偶尔食用其他的植物和动物。此外,大熊猫也会吃一些野果、小动物和植物,但这些都不属于它们的主要食物来源。
480
  [D]: 大熊猫的主要食物是竹子。
481
  [E]: 大熊猫是一种以竹子为主要食物的植食动物,它们的肠道结构适合消化纤维素含量高的竹子,但也会吃一些其他植物,如草本植物和小型哺乳动物。大熊猫的肠道结构也适应了高纤维的竹子,这使得它们能够消化大量的纤维素,从而帮助它们在野外生存。大熊猫的肠道中还含有一种叫做“纤维素酶”的酶类物质,这种酶能够帮助大熊猫分解纤维素,从而帮助它们消化大量的水分和营养物质。此外,大熊猫的肠道还含有一种叫做“纤维素蛋白”的蛋白质,这种蛋白质能够帮助大熊猫维持体内的能量水平,从而帮助它们在野外生存。总的来说,大熊猫的主要食物是竹子,但它们也会吃一些其他植物,如草本植物和小型哺乳动物。
@@ -483,7 +486,7 @@ MobileLLM提出架构的深度比宽度更重要,「深而窄」的「瘦长
483
 
484
  [Q]: 海水为什么是咸的?
485
  [A]: 海水中含有许多盐分,这些盐分可以导致海水咸度的增加。海水中的盐分主要来自于海水中的盐分,这些盐分可以从海水中的盐分中获得。海水中的氧化物和盐分可以与海水中的盐分形成氢键,从而使海水咸度增加。
486
- [B]: 海水中含有丰富的钙、磷、钾和磷等元素,这些元素在海洋中的数量和分布非常有限。此外,海水中还含有各种生物和微生物的营养物质,如贝壳和海藻等。
487
  [C]: 海水含有许多溶解的盐类,其中最主要的是氯化钠。当水中加入盐时,盐的分子会与水分子发生反应,形成氢氧化钠,这是一种盐。
488
  [D]: 因为水是许多生物的生存所必需的,它能够吸收和释放二氧化碳,释放氧气,维持大气中的水蒸气,以及维持水循环。水中的盐类和化学物质会释放出氧气,使地球上的生物呼吸。此外,水还具有许多其他功能,如保持水循环、维持水循环、调节气候和环境等。水是许多生物生存所必需的,它也是许多自然现象的重要基础。
489
  [E]: 海水是咸的主要原因是因为其中含有大量的溶解盐类,其中最主要的是氯化钠(NaCl)。氯化钠是一种非常常见的盐类,它存在于地球上的大部分地区,包括海洋、湖泊、河流和地下水。当水从海洋中蒸发时,它会带走一部分盐分,使海水变得咸。此外,海中还含有其他溶解盐类的物质,如硫酸盐、硫酸镁、硫酸钙、硫酸镁盐、硫酸镁钾、硫酸镁钠、硫酸镁钙、硫酸镁镁钙、硫酸镁镁钙、硫酸镁镁钙、硫酸镁镁钠、硫酸镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁镁钙、硫酸镁镁镁镁
 
57
 
58
  | 模型 (大小) | tokenizer长度 | 推理占用 | release | 主观评分(/100) |
59
  |-------------------------|-------------|--------|------------|------------|
60
+ | minimind-v1-small (26M) | 6400 | 0.5 GB | 2024.08.28 | 50' |
61
  | minimind-v1-moe (4×26M) | 6400 | 1.0 GB | 2024.09.17 | 55' |
62
  | minimind-v1 (108M) | 6400 | 1.0 GB | 2024.09.01 | 60' |
63
 
 
320
  修改模型配置见[./model/LMConfig.py](./model/LMConfig.py)。
321
  minimind目前训练的模型版本见下表:
322
 
323
+ | Model Name | params | len_vocab | n_layers | d_model | kv_heads | q_heads | share+route | TopK |
324
+ |-------------------|--------|-----------|----------|---------|----------|---------|-------------|------|
325
  | minimind-v1-small | 26M | 6400 | 8 | 512 | 8 | 16 | - | - |
326
+ | minimind-v1-moe | 4×26M | 6400 | 8 | 512 | 8 | 16 | 2+4 | 2 |
327
+ | minimind-v1 | 108M | 6400 | 16 | 768 | 8 | 16 | - | - |
 
328
 
329
  # 📌 Experiment
330
 
 
335
  环境:python 3.9 + Torch 2.1.2 + DDP多卡训练
336
  ```
337
 
338
+ | Model Name | params | len_vocab | batch_size | pretrain_time | sft_single_time | sft_multi_time |
339
+ |-------------------|--------|-----------|------------|-------------------|-------------------|---------------------|
340
  | minimind-v1-small | 26M | 6400 | 64 | ≈2 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
341
+ | minimind-v1-moe | 4×26M | 6400 | 40 | ≈6 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch) |
342
+ | minimind-v1 | 108M | 6400 | 16 | ≈6 hour (1 epoch) | ≈4 hour (1 epoch) | ≈1 hour (1 epoch) |
343
 
344
  ---
345
 
 
381
  ```bash
382
  python 5-dpo_train.py
383
  ```
384
+
385
  ---
386
 
387
  📋关于LLM的参数配置,有一篇很有意思的论文[MobileLLM](https://arxiv.org/pdf/2402.14905)做了详细的研究和实验。
 
410
  ![gpt3_config.png](./images/gpt3_config.png)
411
 
412
  ---
413
+
414
  ### 训练完成的模型权重
415
 
416
+ [百度网盘](https://pan.baidu.com/s/1KUfSzEkSXYbCCBj0Pw-9fA?pwd=6666)
 
 
 
 
417
 
418
+ | Model Name | params | Config | pretrain_model | single_sft_model | multi_sft_model |
419
+ |-------------------|--------|-----------------------------|----------------------------------------------------------------|----------------------------------------------------------------|----------------------------------------------------------------|
420
+ | minimind-v1-small | 26M | d_model=512<br/>n_layers=8 | [链接](https://pan.baidu.com/s/1wP_cAIc8cgaJ6CxUmR9ECQ?pwd=6666) | [链接](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666) | [链接](https://pan.baidu.com/s/1GsGsWSL0Dckl0YPRXiBIFQ?pwd=6666) |
421
+ | minimind-v1-moe | 4×26M | d_model=512<br/>n_layers=8 | [链接](https://pan.baidu.com/s/1IZdkzPRhbZ_bSsRL8vInjg?pwd=6666) | [链接](https://pan.baidu.com/s/1tqB-GMvuiGQBvEl-yZ-oBw?pwd=6666) | [链接](https://pan.baidu.com/s/1GHJ2T4904EcT1u8l1rVqtg?pwd=6666) |
422
+ | minimind-v1 | 108M | d_model=768<br/>n_layers=16 | - | [链接](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666) | [链接](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
423
 
424
+ ---
425
 
426
  # 📌 Eval
427
 
 
429
  > 以下测试于2024-09-17完成,此日期后发布的新模型,无特殊需要时将不加入测试。
430
 
431
  [A] [minimind-v1-small(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
432
+ [B] [minimind-v1-moe(0.1B)](https://pan.baidu.com/s/1tqB-GMvuiGQBvEl-yZ-oBw?pwd=6666)<br/>
433
  [C] [minimind-v1(0.1B)](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)<br/>
434
  [D] [baby-llama2-chinese(0.2B)](https://github.com/DLLXW/baby-llama2-chinese)<br/>
435
  [E] [chatlm-mini-chinese(0.2B)](https://github.com/charent/ChatLM-mini-Chinese)<br/>
 
437
  ```text
438
  [Q]: 你知道长江吗?
439
  [A]: 是的,长江是中国最长的河流,全长超过3300千米。
440
+ [B]: 长江是中国最长的河流,总长度约为6300公里。它发源于青藏高原的青海湖,流经青海、四川、甘肃、宁夏、内蒙古、陕西和山西。长江流域是中国最大的农业区之一,也是中国的文化和经济中心。
441
  [C]: 是的,长江是中国最长的河流之一,也是世界第三大河流。长江发源于青藏高原,流经11个省份,最终注入东海。长江流域是中国最大的河流流域之一,拥有着丰富的自然资源和文化遗产,如长江大桥、长江海洋博物馆、长江古城等。
442
  [D]: 同时也知道,中国是世界上最长的城市之一,拥有超过3400万人口。
443
  [E]: 是的,长江是中国最长的河流,也是世界上最长的河流之一。它发源于青藏高原,流经九个省区,最终注入东海。长江流域的气候温和,四季分明,是中国重要的农业和工业基地。长江流域的水产资源丰富,是中国的重要渔业资源之一。
 
445
 
446
  [Q]: 第一颗人造卫星是哪个国家发射的?
447
  [A]: 第一颗人造卫星是斯普特尼克1号。
448
+ [B]: 第一颗人造卫星是俄罗斯的国际卫星,于1957年发射。
449
  [C]: 第一颗人造卫星是苏联发射的,它是由苏联于1957年10月发射的。
450
  [D]: 这颗名为“阿姆斯特朗”的卫星,于1961年11月20日发射,是第一个成功将人类送入太空的卫星。
451
  [E]: 第一颗人造卫星是苏联于1957年4月4日发射的,它是由苏联宇航员尤里·加加林(Yuri Gagarin)和苏联工程师罗尔德·谢尔盖耶夫(Roald Skohley)共同发射的。在发射过程中,加加林和谢尔盖耶夫使用了一个名为“斯普特尼克1号”的小型卫星,它的直径为4.3米,重量为1,051公斤。这枚卫星被称为“斯普特尼克1号”,是苏联太空竞赛中的一个里程碑,也是人类历史上第一次在太空中进行太空行走。
 
453
 
454
  [Q]: 你知道泰山在哪里吗?
455
  [A]: 泰山位于中国东部地区,是中国五岳之一。
456
+ [B]: 泰山位于山东省泰安市泰安市。泰安市是中国五岳之一,也是世界文化遗产和中国历史文化名山之一。泰安市有许多著名的景点,如五岳之一、玉皇顶、南天门、玉皇阁等。
457
  [C]: 泰山位于中国山东省泰安市,是中国五岳之一。
458
  [D]: 作为一个人工智能助手,我无法访问互联网,因此无法提供关于你所在位置的信息。不过,你可以通过搜索或搜索相关关键词来获取有关你所在位置的信息,或者查看相关新闻或网站来获取有关你所在位置的信息。
459
  [E]: 泰山位于中国山东省泰安市境内,是中国五岳之一,也是中国文化和历史的重要象征之一。它是中国五岳之一,也是中国道教和佛教的发源地之一。泰山的海拔高度为1,545米,是中国五岳之一,也是中国最著名的旅游景点之一。它的历史可以追溯到公元前5世纪,是中国古代帝王祭祀的场所。泰山也是中国文化的重要组成部分,被誉为“五岳之首”,是中国文化的重要遗产之一。
460
+ [F]: 泰山位于中国山东省中部,是中国的五岳之一,也是世界文化和自然遗产。它不仅以其雄伟的自然景观闻名,还承载着丰富的历史文化,是中国古代帝王封禅的重要场所。泰山是中华民族的象征之一,体现了中国人民对自然和历史的尊重与传承。
461
 
462
 
463
  [Q]: 世界上最高的山峰是什么?
464
  [A]: 世界上最高的山峰是珠穆朗玛峰,位于尼泊尔和中国的边境上,海拔8848米。
465
+ [B]: 世界上最高的山峰是珠穆朗玛峰,位于喜马拉雅山脉,海拔8,848米。
466
  [C]: 珠穆朗玛峰是世界上最高的山峰,海拔8848米。
467
  [D]: 世界上最高的山峰是珠穆朗玛峰,位于喜马拉雅山脉,海拔8,848.86米(29,031.69英尺)。
468
  [E]: 世界上最高的山峰是珠穆朗玛峰,也称为萨加玛塔,位于喜马拉雅山脉的尼泊尔和中国边境之间,海拔8,848.86米(29,031.69英尺)。它是世界上最著名的登山目标之一,吸引着来自世界各地的登山者前来挑战。珠穆朗玛峰的名字来自藏语,意思是“女神的额峰”,它是由两个不同的山峰组成的,其中一个是位于尼泊尔的珠穆朗玛峰,另一个则位于中国。它的高度是通过测量海拔高度来确定的,因为它是通过测量地球的平均高度来确定的。尽管珠穆朗玛峰是世界上最高的山峰,但它并不是地球上任何其他地方都能到达的。地球上的其他地方,如南美洲的安第斯山脉,也是世界上最高的山脉,但它并不是地球上最高的山峰。总之,珠穆朗玛峰是世界上最高的山峰,但它并不是地球上任何地方都能到达的。它是地球上最高的山脉,但它并不是地球上任何地方都能到达的。
 
470
 
471
  [Q]: 万有引力是谁提出的?
472
  [A]: 万有引力是由英国物理学家牛顿提出的。他认为,所有物体都会产生引力,这个引力是由物体的质量和距离决定的。这个理论揭示了质量和距离之间的关系,并证明了万有引力的本质。
473
+ [B]: 万有引力是由牛顿在1687年提出的。它是宇宙中一个基本的物理现象,它描述了物体之间的相互作用。这个力被称为万有引力,它在所有物体之间都存在,这种力与物体的质量和距离有关。万有引力是自然界中最基本的力量之一,它不仅影响着物体的运动和变化,还对宇宙的结构和演化产生着重要的影响。
474
  [C]: 万有引力是英国物理学家艾萨克·牛顿在17世纪提出的。
475
  [D]: 引力是宇宙中一种力量,它是由质量或能量所组成的。根据牛顿的万有引力定律,一个物体之间的引力大小与它的质量成正比,与它的质量成反比。因此,牛顿的万有引力定律表明,一个物体之间的引力大小与它的质量成正比,与它的质量成反比。这个定律在物理学、天文学和物理学中都有广泛的应用。
476
  [E]: 万有引力是由英国物理学家艾萨克·牛顿于1687年提出的。他在1687年发表了一篇著名的论文,题为《自然哲学的数学原理》,其中提出了万有引力定律,指出任何两个物体之间都存在着一种引力,这种引力的大小与它们的质量成正比,与它们之间的距离的平方成反比。这个定律被称为万有引力定律,它描述了质量和距离之间的关系。它表明,两个物体之间的引力与它们的质量成正比,与它们之间的距离的平方成反比。这个定律对于解释宇宙中的许多现象和现象之间的相互作用非常重要,例如行星绕太阳运动、恒星之间的引力透镜效应等等。它也被广泛应用于天文学、物理学和工程学等领域,为我们理解宇宙的本质和演化提供了重要的理论基础。
 
478
 
479
  [Q]: 你知道大熊猫的主要食物是什么吗?
480
  [A]: 大熊猫的主要食物是竹子。它们通常吃竹子,每天需要吃掉大约30公斤的竹子。
481
+ [B]: 是的,大熊猫主要以竹子为食。它们每天需要吃掉大量的竹子来维持生命,每天需要吃掉40至80公斤的竹子。此外,它们也会吃一些其他的植物,如草、果实、坚果和小型哺乳动物。
482
  [C]: 大熊猫主要以竹子为食,也会偶尔食用其他的植物和动物。此外,大熊猫也会吃一些野果、小动物和植物,但这些都不属于它们的主要食物来源。
483
  [D]: 大熊猫的主要食物是竹子。
484
  [E]: 大熊猫是一种以竹子为主要食物的植食动物,它们的肠道结构适合消化纤维素含量高的竹子,但也会吃一些其他植物,如草本植物和小型哺乳动物。大熊猫的肠道结构也适应了高纤维的竹子,这使得它们能够消化大量的纤维素,从而帮助它们在野外生存。大熊猫的肠道中还含有一种叫做“纤维素酶”的酶类物质,这种酶能够帮助大熊猫分解纤维素,从而帮助它们消化大量的水分和营养物质。此外,大熊猫的肠道还含有一种叫做“纤维素蛋白”的蛋白质,这种蛋白质能够帮助大熊猫维持体内的能量水平,从而帮助它们在野外生存。总的来说,大熊猫的主要食物是竹子,但它们也会吃一些其他植物,如草本植物和小型哺乳动物。
 
486
 
487
  [Q]: 海水为什么是咸的?
488
  [A]: 海水中含有许多盐分,这些盐分可以导致海水咸度的增加。海水中的盐分主要来自于海水中的盐分,这些盐分可以从海水中的盐分中获得。海水中的氧化物和盐分可以与海水中的盐分形成氢键,从而使海水咸度增加。
489
+ [B]: 海水是咸的,这是因为它含有大量的盐分,其中包括氯离子和氯离子。这些离子在海洋中形成了大量的盐,这些盐分被称为盐分。海水中含有许多溶解在水中的盐,这些盐分会随着海水的升高而逐渐蒸发,最终导致海水中盐分的蒸发。这些盐分在海水中被完全消耗,然后被排除在海水中。此外,海水中还含有许多其他的盐分,但这些盐分是不同的,因为它们在水中的浮起和下降过程是不同的。海水中的盐分是从海底的岩石中提取出来的,这些盐分在海水中可以被利用。总的来说,海水是咸的,这也是为什么海水中含有许多盐分和盐分的原因。
490
  [C]: 海水含有许多溶解的盐类,其中最主要的是氯化钠。当水中加入盐时,盐的分子会与水分子发生反应,形成氢氧化钠,这是一种盐。
491
  [D]: 因为水是许多生物的生存所必需的,它能够吸收和释放二氧化碳,释放氧气,维持大气中的水蒸气,以及维持水循环。水中的盐类和化学物质会释放出氧气,使地球上的生物呼吸。此外,水还具有许多其他功能,如保持水循环、维持水循环、调节气候和环境等。水是许多生物生存所必需的,它也是许多自然现象的重要基础。
492
  [E]: 海水是咸的主要原因是因为其中含有大量的溶解盐类,其中最主要的是氯化钠(NaCl)。氯化钠是一种非常常见的盐类,它存在于地球上的大部分地区,包括海洋、湖泊、河流和地下水。当水从海洋中蒸发时,它会带走一部分盐分,使海水变得咸。此外,海中还含有其他溶解盐类的物质,如硫酸盐、硫酸镁、硫酸钙、硫酸镁盐、硫酸镁钾、硫酸镁钠、硫酸镁钙、硫酸镁镁钙、硫酸镁镁钙、硫酸镁镁钙、硫酸镁镁钠、硫酸镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁钙、硫酸镁镁镁镁镁镁镁钙、硫酸镁镁镁镁
README_en.md CHANGED
@@ -59,13 +59,14 @@ Therefore, the goal of this project is to lower the barrier to entry for working
59
  training an extremely lightweight language model from scratch.
60
 
61
  > [!CAUTION]
62
- > As of 2024-09-17, MiniMind has trained three model versions, with the smallest model requiring only 26M (0.02B) parameters to achieve smooth conversational abilities!
 
63
 
64
- | Model (Size) | Tokenizer Length | Inference Memory Usage | Release Date | Subjective Rating (/100) |
65
- |-------------------------------|------------------|------------------------|--------------|--------------------------|
66
- | minimind-v1-small (26M) | 6400 | 0.5 GB | 2024.08.28 | 50' |
67
- | minimind-v1-moe (4×26M) | 6400 | 1.0 GB | 2024.09.17 | 55' |
68
- | MiniMind-V1 (108M) | 6400 | 1.0 GB | 2024.09.01 | 60' |
69
 
70
  > This analysis was run on an RTX 3090 GPU with Torch 2.1.2, CUDA 12.2, and Flash Attention 2.
71
 
@@ -84,18 +85,21 @@ The project includes:
84
  We hope this open-source project helps LLM beginners get started quickly!
85
 
86
  ### 👉**Recent Updates**
 
87
  <details close>
88
  <summary> <b>2024-09-17 (new🎉)</b> </summary>
89
 
90
  - Updated the minimind-v1-moe model
91
- - To prevent ambiguity, all mistral_tokenizer versions have been removed, and a custom minimind_tokenizer is now used as the tokenizer.
 
92
 
93
  </details>
94
 
95
  <details close>
96
  <summary> <b>2024-09-01</b> </summary>
97
 
98
- - Updated the MiniMind-V1 (108M) model, using minimind_tokenizer with 3 pre-training epochs and 10 SFT epochs for more thorough training and improved performance.
 
99
 
100
  - The project has been deployed to ModelScope's Creative Space and can be experienced on the website:
101
 
@@ -167,6 +171,7 @@ The project has been deployed to ModelScope makerspace, where you can experience
167
 
168
  *
169
  0. Install the required dependencies
 
170
  ```bash
171
  pip install -r requirements.txt
172
  ```
@@ -196,7 +201,8 @@ git clone https://github.com/jingyaogong/minimind.git
196
  3. Test model inference performance
197
 
198
  * Ensure that the required trained parameter weights are located in the `./out/` directory.
199
- * You can also directly download and use the trained model weights from [Trained Model Weights](#trained-model-weights).
 
200
  ```text
201
  out
202
  ├── multi_chat
@@ -261,10 +267,16 @@ git clone https://github.com/jingyaogong/minimind.git
261
  </table>
262
 
263
  > [!IMPORTANT]
264
- > Update on 2024-09-17: To avoid ambiguity from previous versions and control the model size, all Minimind models now use the Minimind_tokenizer for tokenization, and all versions of the Mistral_tokenizer have been deprecated.
 
265
 
266
- > Although the Minimind_tokenizer has a small length and its encoding/decoding efficiency is weaker compared to Chinese-friendly tokenizers like Qwen2 and GLM, the Minimind models have opted for their custom-trained Minimind_tokenizer to maintain a lightweight parameter structure and prevent an imbalance between encoding and computation layers. This is because the Minimind vocabulary size is only 6,400.
267
- > Moreover, Minimind has not encountered any issues with decoding rare words in practical tests, and the performance has been satisfactory. Due to the custom vocabulary being compressed to 6,400 tokens, the total parameter size of the LLM is minimized to only 26M.
 
 
 
 
 
268
 
269
  ---
270
 
@@ -346,12 +358,11 @@ and FFN layer code. The structure is illustrated in the figure below (redrawn):
346
  Model configurations can be found in [./model/LMConfig.py](./model/LMConfig.py). The model types and parameters are
347
  shown in the table below:
348
 
349
- | Model Name | params | len_vocab | n_layers | d_model | kv_heads | q_heads | share+route | TopK |
350
- |------------------|--------|-----------|----------|---------|----------|---------|-------------|------|
351
  | minimind-v1-small | 26M | 6400 | 8 | 512 | 8 | 16 | - | - |
352
- | minimind-v1-moe | 4×26M | 6400 | 8 | 512 | 8 | 16 | 2+4 | 2 |
353
- | minimind-v1 | 108M | 6400 | 16 | 768 | 8 | 16 | - | - |
354
-
355
 
356
  # 📌 Experiment
357
 
@@ -362,11 +373,11 @@ GPU: NVIDIA GeForce RTX 3090 (24GB) * 2
362
  Environment: python 3.9 + Torch 2.1.2 + DDP multi-GPU training
363
  ```
364
 
365
- | Model Name | params | len_vocab | batch_size | pretrain_time | sft_single_time | sft_multi_time |
366
- |------------------|--------|-----------|------------|-------------------|-------------------|---------------------|
367
  | minimind-v1-small | 26M | 6400 | 64 | ≈2 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
368
- | minimind-v1-moe | 4×26M | 6400 | 40 | ≈6 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch) |
369
- | minimind-v1 | 108M | 6400 | 16 | ≈6 hour (1 epoch) | ≈4 hour (1 epoch) | ≈1 hour (1 epoch) |
370
 
371
  ---
372
 
@@ -428,43 +439,60 @@ Environment: python 3.9 + Torch 2.1.2 + DDP multi-GPU training
428
  ```bash
429
  python 5-dpo_train.py
430
  ```
 
431
  ---
432
- 📋 Regarding LLM parameter configuration, an interesting paper [MobileLLM](https://arxiv.org/pdf/2402.14905) provides detailed research and experiments.
433
- The scaling law exhibits unique patterns in small models. The parameters that significantly influence the scaling of Transformer models are primarily `d_model` and `n_layers`.
 
 
434
 
435
  * `d_model`↑ + `n_layers`↓ -> Short and wide models
436
  * `d_model`↓ + `n_layers`↑ -> Tall and narrow models
437
 
438
- The Scaling Law proposed in 2020 posits that the amount of training data, parameter count, and training iterations are the key factors determining performance, with the influence of model architecture being nearly negligible. However, this law seems not to fully apply to small models.
439
- MobileLLM suggests that the depth of the architecture is more important than its width. A "deep and narrow" model can learn more abstract concepts compared to a "wide and shallow" model. For instance, when the model parameters are fixed at 125M or 350M, a 30–42 layer "narrow" model significantly outperforms a 12-layer "short and wide" model. This trend is observed across eight benchmark tests, including common sense reasoning, question answering, and reading comprehension.
440
- This is a fascinating discovery, as previously, few attempts were made to stack more than 12 layers when designing architectures for small models around the 100M parameter range. This aligns with the observations from MiniMind, where adjusting parameters between `d_model` and `n_layers` during training produced similar effects.
441
- However, "deep and narrow" has its limitations. When `d_model` < 512, the disadvantages of collapsing word embedding dimensions become very pronounced, and increasing layers does not compensate for the shortcomings in `d_head` caused by fixed `q_head`. Conversely, when `d_model` > 1536, increasing layers seems to have a higher priority than `d_model`, providing a better "cost-performance" ratio and effect gain.
442
- Therefore, MiniMind sets `d_model = 512` and `n_layers = 8` for the small model to achieve a balance between "minimal size <-> better performance." For greater performance gains, `d_model = 768` and `n_layers = 16` are set, aligning better with the scaling law for small models.
 
 
 
 
 
 
 
 
 
 
 
 
443
 
444
  > For reference, the configuration details for GPT-3 are shown in the table below:
445
 
446
  ![gpt3_config.png](./images/gpt3_config.png)
447
 
448
  ---
 
449
  ### Trained Model Weights
450
 
 
451
 
452
- | Model Name | params | Config | pretrain_model | single_sft_model | multi_sft_model |
453
- |-------------------|--------|-----------------------------|----------------|-----------------------------------------------------------------|----------------------------------------------------------------|
454
- | minimind-v1-small | 26M | d_model=512<br/>n_layers=8 | - | [URL](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666) | [URL](https://pan.baidu.com/s/1GsGsWSL0Dckl0YPRXiBIFQ?pwd=6666) |
455
- | minimind-v1-moe | 4×26M | d_model=512<br/>n_layers=8 | - | - | - |
456
- | minimind-v1 | 108M | d_model=768<br/>n_layers=16 | - | [URL](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666) | [URL](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
457
 
458
  ---
459
 
460
-
461
  # 📌 Eval
462
 
463
  > [!TIP]
464
- > The following tests were completed on September 17, 2024. New models released after this date will not be included in the tests unless there is a special need.
 
465
 
466
  [A] [minimind-v1-small(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
467
- [B] minimind-v1-moe(0.1B)<br/>
468
  [C] [minimind-v1(0.1B)](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)<br/>
469
  [D] [baby-llama2-chinese(0.2B)](https://github.com/DLLXW/baby-llama2-chinese)<br/>
470
  [E] [chatlm-mini-chinese(0.2B)](https://github.com/charent/ChatLM-mini-Chinese)<br/>
@@ -523,26 +551,33 @@ Therefore, MiniMind sets `d_model = 512` and `n_layers = 8` for the small model
523
  > 🙋‍♂️ The answers from the models above were passed directly to GPT-4o, which was asked to score them:
524
 
525
  ---
 
526
  ### Model Performance Review:
527
 
528
  1. **Model A**:
529
- - **Performance**: Model A's responses are usually concise and clear but lack detail and accuracy in some cases. For example, Model A provided incorrect information about the length of the Yangtze River.
 
530
  - **Score**: 60
531
 
532
  2. **Model B**:
533
- - **Performance**: Model B provides additional information in some cases, but this information can sometimes be inaccurate or excessive. For instance, Model B gave incorrect figures for the length and drainage area of the Yangtze River.
 
 
534
  - **Score**: 65
535
 
536
  3. **Model C**:
537
- - **Performance**: Model C typically provides detailed and accurate answers for most questions. For example, responses about the Yangtze River and Mount Tai were accurate.
 
538
  - **Score**: 75
539
 
540
  4. **Model D**:
541
- - **Performance**: Model D’s responses sometimes appear disorganized and lack accuracy. For example, the answer about Mount Tai was completely off-topic.
 
542
  - **Score**: 50
543
 
544
  5. **Model E**:
545
- - **Performance**: Model E’s responses are usually very detailed, but they can be overly verbose and contain unnecessary information. For instance, the answer on gravity was overly complex.
 
546
  - **Score**: 70
547
 
548
  #### Ranking (from highest to lowest):
@@ -555,13 +590,21 @@ Therefore, MiniMind sets `d_model = 512` and `n_layers = 8` for the small model
555
 
556
  ## 👉 Summary of Effects
557
 
558
- * The ranking of the minimind series (ABC) is intuitive, with minimind-v1(0.1B) scoring the highest and providing mostly accurate answers to common knowledge questions.
 
559
  * Surprisingly, minimind-v1-small (0.02B) with only 26M parameters performs close to minimind-v1(0.1B).
560
- * Despite having less than 2 epochs of training, minimind-v1(0.1B) performed the best. This suggests that a larger model often yields better performance, even with limited training.
561
- * minimind-v1-moe (0.1B) performed poorly, likely because it was terminated early to free up resources for smaller models. MoE models require more training epochs, and with only 2 epochs, it was under-trained. Previous experiments with a fully trained MoE model on Yi tokenizer showed visible improvements. Future versions, v2 and v3, will be updated with better training.
562
-
563
- * Model E’s responses appear the most complete, despite some instances of hallucination and overly verbose content. However, GPT-4o and Deepseek's evaluations suggest it is "overly verbose and repetitive, with some hallucinations."
564
- This strict evaluation might penalize models with some hallucinations heavily. Due to F models having longer default text lengths and much larger datasets, the quality of responses depends significantly on the data rather than the model size alone.
 
 
 
 
 
 
 
565
 
566
  > 🙋‍♂️ Personal Subjective Evaluation: E>C>B≈A>D
567
 
@@ -604,7 +647,6 @@ answering, so results should be considered as reference only.
604
  | minimind-v1-small | 344 | 1346 | 25.56% |
605
  | minimind-v1 | 351 | 1346 | 26.08% |
606
 
607
-
608
  ### Model Performance Insights from GPT-4o
609
 
610
  ```text
@@ -708,10 +750,12 @@ your model with third-party UIs, such as fastgpt, OpenWebUI, etc.
708
  </a>
709
  -->
710
 
711
- <a href="https://github.com/jingyaogong"><img src="https://avatars.githubusercontent.com/u/62287848" width="70px" height="70px"/></a>&nbsp;
712
- <a href="https://github.com/MuWinds"><img src="https://avatars.githubusercontent.com/u/93832089" width="70px" height="70px"/></a>&nbsp;
713
- <a href="https://github.com/chuanzhubin"><img src="https://avatars.githubusercontent.com/u/2813798" width="70px" height="70px"/></a>&nbsp;
714
-
 
 
715
 
716
  ## 😊Thanks for
717
 
 
59
  training an extremely lightweight language model from scratch.
60
 
61
  > [!CAUTION]
62
+ > As of 2024-09-17, MiniMind has trained three model versions, with the smallest model requiring only 26M (0.02B)
63
+ > parameters to achieve smooth conversational abilities!
64
 
65
+ | Model (Size) | Tokenizer Length | Inference Memory Usage | Release Date | Subjective Rating (/100) |
66
+ |-------------------------|------------------|------------------------|--------------|--------------------------|
67
+ | minimind-v1-small (26M) | 6400 | 0.5 GB | 2024.08.28 | 50' |
68
+ | minimind-v1-moe (4×26M) | 6400 | 1.0 GB | 2024.09.17 | 55' |
69
+ | MiniMind-V1 (108M) | 6400 | 1.0 GB | 2024.09.01 | 60' |
70
 
71
  > This analysis was run on an RTX 3090 GPU with Torch 2.1.2, CUDA 12.2, and Flash Attention 2.
72
 
 
85
  We hope this open-source project helps LLM beginners get started quickly!
86
 
87
  ### 👉**Recent Updates**
88
+
89
  <details close>
90
  <summary> <b>2024-09-17 (new🎉)</b> </summary>
91
 
92
  - Updated the minimind-v1-moe model
93
+ - To prevent ambiguity, all mistral_tokenizer versions have been removed, and a custom minimind_tokenizer is now used as
94
+ the tokenizer.
95
 
96
  </details>
97
 
98
  <details close>
99
  <summary> <b>2024-09-01</b> </summary>
100
 
101
+ - Updated the MiniMind-V1 (108M) model, using minimind_tokenizer with 3 pre-training epochs and 10 SFT epochs for more
102
+ thorough training and improved performance.
103
 
104
  - The project has been deployed to ModelScope's Creative Space and can be experienced on the website:
105
 
 
171
 
172
  *
173
  0. Install the required dependencies
174
+
175
  ```bash
176
  pip install -r requirements.txt
177
  ```
 
201
  3. Test model inference performance
202
 
203
  * Ensure that the required trained parameter weights are located in the `./out/` directory.
204
+ * You can also directly download and use the trained model weights
205
+ from [Trained Model Weights](#trained-model-weights).
206
  ```text
207
  out
208
  ├── multi_chat
 
267
  </table>
268
 
269
  > [!IMPORTANT]
270
+ > Update on 2024-09-17: To avoid ambiguity from previous versions and control the model size, all Minimind models now
271
+ use the Minimind_tokenizer for tokenization, and all versions of the Mistral_tokenizer have been deprecated.
272
 
273
+ > Although the Minimind_tokenizer has a small length and its encoding/decoding efficiency is weaker compared to
274
+ Chinese-friendly tokenizers like Qwen2 and GLM, the Minimind models have opted for their custom-trained
275
+ Minimind_tokenizer to maintain a lightweight parameter structure and prevent an imbalance between encoding and
276
+ computation layers. This is because the Minimind vocabulary size is only 6,400.
277
+ > Moreover, Minimind has not encountered any issues with decoding rare words in practical tests, and the performance
278
+ has been satisfactory. Due to the custom vocabulary being compressed to 6,400 tokens, the total parameter size of the
279
+ LLM is minimized to only 26M.
280
 
281
  ---
282
 
 
358
  Model configurations can be found in [./model/LMConfig.py](./model/LMConfig.py). The model types and parameters are
359
  shown in the table below:
360
 
361
+ | Model Name | params | len_vocab | n_layers | d_model | kv_heads | q_heads | share+route | TopK |
362
+ |-------------------|--------|-----------|----------|---------|----------|---------|-------------|------|
363
  | minimind-v1-small | 26M | 6400 | 8 | 512 | 8 | 16 | - | - |
364
+ | minimind-v1-moe | 4×26M | 6400 | 8 | 512 | 8 | 16 | 2+4 | 2 |
365
+ | minimind-v1 | 108M | 6400 | 16 | 768 | 8 | 16 | - | - |
 
366
 
367
  # 📌 Experiment
368
 
 
373
  Environment: python 3.9 + Torch 2.1.2 + DDP multi-GPU training
374
  ```
375
 
376
+ | Model Name | params | len_vocab | batch_size | pretrain_time | sft_single_time | sft_multi_time |
377
+ |-------------------|--------|-----------|------------|-------------------|-------------------|---------------------|
378
  | minimind-v1-small | 26M | 6400 | 64 | ≈2 hour (1 epoch) | ≈2 hour (1 epoch) | ≈0.5 hour (1 epoch) |
379
+ | minimind-v1-moe | 4×26M | 6400 | 40 | ≈6 hour (1 epoch) | ≈5 hour (1 epoch) | ≈1 hour (1 epoch) |
380
+ | minimind-v1 | 108M | 6400 | 16 | ≈6 hour (1 epoch) | ≈4 hour (1 epoch) | ≈1 hour (1 epoch) |
381
 
382
  ---
383
 
 
439
  ```bash
440
  python 5-dpo_train.py
441
  ```
442
+
443
  ---
444
+ 📋 Regarding LLM parameter configuration, an interesting paper [MobileLLM](https://arxiv.org/pdf/2402.14905) provides
445
+ detailed research and experiments.
446
+ The scaling law exhibits unique patterns in small models. The parameters that significantly influence the scaling of
447
+ Transformer models are primarily `d_model` and `n_layers`.
448
 
449
  * `d_model`↑ + `n_layers`↓ -> Short and wide models
450
  * `d_model`↓ + `n_layers`↑ -> Tall and narrow models
451
 
452
+ The Scaling Law proposed in 2020 posits that the amount of training data, parameter count, and training iterations are
453
+ the key factors determining performance, with the influence of model architecture being nearly negligible. However, this
454
+ law seems not to fully apply to small models.
455
+ MobileLLM suggests that the depth of the architecture is more important than its width. A "deep and narrow" model can
456
+ learn more abstract concepts compared to a "wide and shallow" model. For instance, when the model parameters are fixed
457
+ at 125M or 350M, a 30–42 layer "narrow" model significantly outperforms a 12-layer "short and wide" model. This trend is
458
+ observed across eight benchmark tests, including common sense reasoning, question answering, and reading comprehension.
459
+ This is a fascinating discovery, as previously, few attempts were made to stack more than 12 layers when designing
460
+ architectures for small models around the 100M parameter range. This aligns with the observations from MiniMind, where
461
+ adjusting parameters between `d_model` and `n_layers` during training produced similar effects.
462
+ However, "deep and narrow" has its limitations. When `d_model` < 512, the disadvantages of collapsing word embedding
463
+ dimensions become very pronounced, and increasing layers does not compensate for the shortcomings in `d_head` caused by
464
+ fixed `q_head`. Conversely, when `d_model` > 1536, increasing layers seems to have a higher priority than `d_model`,
465
+ providing a better "cost-performance" ratio and effect gain.
466
+ Therefore, MiniMind sets `d_model = 512` and `n_layers = 8` for the small model to achieve a balance between "minimal
467
+ size <-> better performance." For greater performance gains, `d_model = 768` and `n_layers = 16` are set, aligning
468
+ better with the scaling law for small models.
469
 
470
  > For reference, the configuration details for GPT-3 are shown in the table below:
471
 
472
  ![gpt3_config.png](./images/gpt3_config.png)
473
 
474
  ---
475
+
476
  ### Trained Model Weights
477
 
478
+ [baidu](https://pan.baidu.com/s/1KUfSzEkSXYbCCBj0Pw-9fA?pwd=6666)
479
 
480
+ | Model Name | params | Config | pretrain_model | single_sft_model | multi_sft_model |
481
+ |-------------------|--------|-----------------------------|-----------------------------------------------------------------|----------------------------------------------------------------|----------------------------------------------------------------|
482
+ | minimind-v1-small | 26M | d_model=512<br/>n_layers=8 | [URL](https://pan.baidu.com/s/1wP_cAIc8cgaJ6CxUmR9ECQ?pwd=6666) | [URL](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666) | [URL](https://pan.baidu.com/s/1GsGsWSL0Dckl0YPRXiBIFQ?pwd=6666) |
483
+ | minimind-v1-moe | 4×26M | d_model=512<br/>n_layers=8 | [URL](https://pan.baidu.com/s/1IZdkzPRhbZ_bSsRL8vInjg?pwd=6666) | [URL](https://pan.baidu.com/s/1tqB-GMvuiGQBvEl-yZ-oBw?pwd=6666) | [URL](https://pan.baidu.com/s/1GHJ2T4904EcT1u8l1rVqtg?pwd=6666) |
484
+ | minimind-v1 | 108M | d_model=768<br/>n_layers=16 | - | [URL](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666) | [URL](https://pan.baidu.com/s/12iHGpAs6R0kqsOnGtgK6vQ?pwd=6666) |
485
 
486
  ---
487
 
 
488
  # 📌 Eval
489
 
490
  > [!TIP]
491
+ > The following tests were completed on September 17, 2024. New models released after this date will not be included in
492
+ > the tests unless there is a special need.
493
 
494
  [A] [minimind-v1-small(0.02B)](https://pan.baidu.com/s/1_COe0FQRDmeapSsvArahCA?pwd=6666)<br/>
495
+ [B] [minimind-v1-moe(0.1B)](https://pan.baidu.com/s/1tqB-GMvuiGQBvEl-yZ-oBw?pwd=6666)<br/>
496
  [C] [minimind-v1(0.1B)](https://pan.baidu.com/s/1p713loS7EfwHQf3G9eYI3Q?pwd=6666)<br/>
497
  [D] [baby-llama2-chinese(0.2B)](https://github.com/DLLXW/baby-llama2-chinese)<br/>
498
  [E] [chatlm-mini-chinese(0.2B)](https://github.com/charent/ChatLM-mini-Chinese)<br/>
 
551
  > 🙋‍♂️ Feed the answers of the models above directly to GPT-4o and ask it to score them:
552
 
553
  ---
554
+
555
  ### Model Performance Review:
556
 
557
  1. **Model A**:
558
+ - **Performance**: Model A's responses are usually concise and clear but lack detail and accuracy in some cases. For
559
+ example, Model A provided incorrect information about the length of the Yangtze River.
560
  - **Score**: 60
561
 
562
  2. **Model B**:
563
+ - **Performance**: Model B provides additional information in some cases, but this information can sometimes be
564
+ inaccurate or excessive. For instance, Model B gave incorrect figures for the length and drainage area of the
565
+ Yangtze River.
566
  - **Score**: 65
567
 
568
  3. **Model C**:
569
+ - **Performance**: Model C typically provides detailed and accurate answers for most questions. For example,
570
+ responses about the Yangtze River and Mount Tai were accurate.
571
  - **Score**: 75
572
 
573
  4. **Model D**:
574
+ - **Performance**: Model D’s responses sometimes appear disorganized and lack accuracy. For example, the answer
575
+ about Mount Tai was completely off-topic.
576
  - **Score**: 50
577
 
578
  5. **Model E**:
579
+ - **Performance**: Model E’s responses are usually very detailed, but they can be overly verbose and contain
580
+ unnecessary information. For instance, the answer on gravity was overly complex.
581
  - **Score**: 70
582
 
583
  #### Ranking (from highest to lowest):
 
590
 
591
  ## 👉 Summary of Effects
592
 
593
+ * The ranking of the minimind series (ABC) is intuitive, with minimind-v1(0.1B) scoring the highest and providing mostly
594
+ accurate answers to common knowledge questions.
595
  * Surprisingly, minimind-v1-small (0.02B) with only 26M parameters performs close to minimind-v1(0.1B).
596
+ * Despite having less than 2 epochs of training, minimind-v1(0.1B) performed the best. This suggests that a larger
597
+ model often yields better performance, even with limited training.
598
+ * minimind-v1-moe (0.1B) performed poorly, likely because it was terminated early to free up resources for smaller
599
+ models. MoE models require more training epochs, and with only 2 epochs, it was under-trained. Previous
600
+ experiments with a fully trained MoE model on Yi tokenizer showed visible improvements. Future versions, v2 and
601
+ v3, will be updated with better training.
602
+
603
+ * Model E’s responses appear the most complete, despite some instances of hallucination and overly verbose content.
604
+ However, GPT-4o and Deepseek's evaluations suggest it is "overly verbose and repetitive, with some hallucinations."
605
+ This strict evaluation might heavily penalize models with some hallucinations. Due to Model E having longer default
606
+ text lengths and much larger datasets, the quality of responses depends significantly on the data rather than the
607
+ model size alone.
608
 
609
  > 🙋‍♂️ Personal Subjective Evaluation: E>C>B≈A>D
610
 
 
647
  | minimind-v1-small | 344 | 1346 | 25.56% |
648
  | minimind-v1 | 351 | 1346 | 26.08% |
649
 
 
650
  ### Model Performance Insights from GPT-4o
651
 
652
  ```text
 
750
  </a>
751
  -->
752
 
753
+ <a href="https://github.com/jingyaogong"><img src="https://avatars.githubusercontent.com/u/62287848" width="70px" height="70px"/></a>
754
+ &nbsp;
755
+ <a href="https://github.com/MuWinds"><img src="https://avatars.githubusercontent.com/u/93832089" width="70px" height="70px"/></a>
756
+ &nbsp;
757
+ <a href="https://github.com/chuanzhubin"><img src="https://avatars.githubusercontent.com/u/2813798" width="70px" height="70px"/></a>
758
+ &nbsp;
759
 
760
  ## 😊Thanks for
761
 
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Transformer"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "LMConfig.LMConfig",
7
+ "AutoModelForCausalLM": "model.Transformer"
8
+ },
9
+ "aux_loss_alpha": 0.01,
10
+ "dim": 512,
11
+ "dropout": 0.0,
12
+ "flash_attn": true,
13
+ "hidden_dim": null,
14
+ "max_seq_len": 512,
15
+ "model_type": "minimind",
16
+ "multiple_of": 64,
17
+ "n_heads": 16,
18
+ "n_kv_heads": 8,
19
+ "n_layers": 8,
20
+ "n_routed_experts": 4,
21
+ "n_shared_experts": true,
22
+ "norm_eps": 1e-05,
23
+ "norm_topk_prob": true,
24
+ "num_experts_per_tok": 2,
25
+ "scoring_func": "softmax",
26
+ "seq_aux": true,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.44.0",
29
+ "use_moe": true,
30
+ "vocab_size": 6400
31
+ }
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.44.0"
4
+ }
model.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import struct
3
+ import inspect
4
+ from .LMConfig import LMConfig
5
+ from typing import Any, Optional, Tuple
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+ from transformers import PreTrainedModel
11
+ from transformers.modeling_outputs import CausalLMOutputWithPast
12
+
13
+
14
class RMSNorm(torch.nn.Module):
    """Root-mean-square layer normalization with a learnable per-feature gain."""

    def __init__(self, dim: int, eps: float):
        super().__init__()
        self.eps = eps
        # Gain starts at one, so the layer is a pure normalization at init.
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # Multiply by the reciprocal RMS over the last axis; eps keeps rsqrt finite.
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        # Normalize in float32, cast back to the input dtype, then apply the gain.
        normed = self._norm(x.float()).type_as(x)
        return normed * self.weight
26
+
27
+
28
def precompute_pos_cis(dim: int, end: int, theta: float = 10000.0):
    """Precompute the complex rotary position factors.

    Returns a complex64 tensor with one row per position in ``range(end)`` and
    ``dim // 2`` columns; every entry has unit magnitude and phase
    ``position * theta ** (-2j / dim)`` for column ``j``.
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    # Unit-modulus complex numbers carrying the rotation angles (complex64).
    return torch.polar(torch.ones_like(angles), angles)
34
+
35
+
36
def apply_rotary_emb(xq, xk, pos_cis):
    """Rotate query and key tensors by the precomputed complex position factors.

    Adjacent feature pairs on the last axis are treated as complex numbers and
    multiplied by ``pos_cis``; results are returned in the input dtypes.
    """

    def unite_shape(pos_cis, x):
        # Reshape pos_cis so it broadcasts over every axis of x except axis 1
        # and the last axis, which it must match exactly.
        rank = x.ndim
        assert 0 <= 1 < rank
        assert pos_cis.shape == (x.shape[1], x.shape[-1])
        broadcast_shape = [size if axis == 1 or axis == rank - 1 else 1 for axis, size in enumerate(x.shape)]
        return pos_cis.view(*broadcast_shape)

    q_complex = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    k_complex = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    rotation = unite_shape(pos_cis, q_complex)
    # Back to real pairs, then merge the (pair, 2) axes into the feature axis.
    q_rotated = torch.view_as_real(q_complex * rotation).flatten(3)
    k_rotated = torch.view_as_real(k_complex * rotation).flatten(3)
    return q_rotated.type_as(xq), k_rotated.type_as(xk)
50
+
51
+
52
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every KV head ``n_rep`` times along axis 2.

    Equivalent to ``torch.repeat_interleave(x, dim=2, repeats=n_rep)``; the
    input is returned unchanged when ``n_rep == 1``.
    """
    batch, seq_len, kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    # Insert a repeat axis after the head axis, broadcast it, then fold it in.
    expanded = x[:, :, :, None, :].expand(batch, seq_len, kv_heads, n_rep, head_dim)
    return expanded.reshape(batch, seq_len, kv_heads * n_rep, head_dim)
62
+
63
+
64
class Attention(nn.Module):
    """Causal multi-head self-attention with grouped KV heads and rotary embeddings.

    An optional per-module KV cache stores the un-rotated key/value projections
    of the previous call so that, during incremental decoding, only the newest
    token has to be projected.
    """

    def __init__(self, args: LMConfig):
        super().__init__()
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        assert args.n_heads % self.n_kv_heads == 0
        self.n_local_heads = args.n_heads
        self.n_local_kv_heads = self.n_kv_heads
        # Number of query heads that share one KV head.
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.dim // args.n_heads
        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
        self.k_cache, self.v_cache = None, None
        self.attn_dropout = nn.Dropout(args.dropout)
        self.resid_dropout = nn.Dropout(args.dropout)
        self.dropout = args.dropout
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and args.flash_attn

        if not self.flash:
            # Fallback path (no fused attention): precompute an additive causal
            # mask with -inf strictly above the diagonal.
            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
            mask = torch.triu(mask, diagonal=1)
            self.register_buffer("mask", mask)

    def forward(self, x: torch.Tensor, pos_cis: torch.Tensor, use_kv_cache=False):
        """Attend over ``x`` and return a tensor of the same (bsz, seqlen, dim) layout.

        Args:
            x: input activations; unpacked as (bsz, seqlen, feature).
            pos_cis: rotary factors for the ``seqlen`` positions of ``x``.
            use_kv_cache: reuse cached key/value projections when the module is
                in eval mode and the cache covers exactly the first
                ``seqlen - 1`` positions.
        """
        bsz, seqlen, _ = x.shape
        # The cache is an inference-only optimisation. The previous condition
        # called ``self.eval()``, which is always truthy and silently switched
        # the module to eval mode; test the training flag instead.
        if use_kv_cache and not self.training:
            cache_usable = (
                self.k_cache is not None
                and self.k_cache.shape[0] == bsz
                and self.k_cache.shape[1] == seqlen - 1
            )
            if not cache_usable:
                xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
            else:
                # Only the newest token is projected. Earlier query rows are
                # zero placeholders so the tensor keeps length ``seqlen``.
                token = x[:, -1:, :]
                xq = torch.cat((torch.zeros_like(x[:, :-1, :]), self.wq(token)), dim=1)
                xk = torch.cat((self.k_cache, self.wk(token)), dim=1)
                xv = torch.cat((self.v_cache, self.wv(token)), dim=1)

            self.k_cache, self.v_cache = xk, xv
        else:
            xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        xq, xk = apply_rotary_emb(xq, xk, pos_cis)

        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)

        # Move the head axis in front of the sequence axis for attention.
        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        if self.flash:
            output = torch.nn.functional.scaled_dot_product_attention(
                xq, xk, xv, attn_mask=None,
                dropout_p=self.dropout if self.training else 0.0,
                is_causal=True)
        else:
            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
            assert hasattr(self, 'mask')
            scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, seqlen)
            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
            scores = self.attn_dropout(scores)
            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)

        # Merge heads back into the feature axis.
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

        output = self.wo(output)
        output = self.resid_dropout(output)
        return output
134
+
135
+
136
class FeedForward(nn.Module):
    """Gated feed-forward block: ``dropout(w2(silu(w1(x)) * w3(x)))``."""

    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        super().__init__()
        if hidden_dim is None:
            # Default width: two thirds of 4 * dim, rounded up to the next
            # multiple of ``multiple_of``.
            two_thirds = int(2 * (4 * dim) / 3)
            hidden_dim = multiple_of * ((two_thirds + multiple_of - 1) // multiple_of)
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # SiLU-activated branch gates the linear branch before the down-projection.
        gated = F.silu(self.w1(x)) * self.w3(x)
        return self.dropout(self.w2(gated))
150
+
151
+
152
class MoEGate(nn.Module):
    """Router that scores every token against the routed experts.

    ``forward`` returns the indices and weights of the top-k experts per token
    and, in training mode with ``aux_loss_alpha > 0``, a load-balancing
    auxiliary loss (otherwise ``None``).
    """

    def __init__(self, config: LMConfig):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts

        self.scoring_func = config.scoring_func
        self.alpha = config.aux_loss_alpha
        self.seq_aux = config.seq_aux

        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.dim
        # One gating vector per routed expert.
        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim)))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, h = hidden_states.shape

        # Flatten batch and sequence so each row is one token.
        flat_states = hidden_states.view(-1, h)
        logits = F.linear(flat_states, self.weight, None)
        if self.scoring_func != 'softmax':
            raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}')
        scores = logits.softmax(dim=-1)

        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)

        if self.top_k > 1 and self.norm_topk_prob:
            # Renormalize the selected weights so they sum to one per token.
            topk_weight = topk_weight / (topk_weight.sum(dim=-1, keepdim=True) + 1e-20)

        aux_loss = None
        if self.training and self.alpha > 0.0:
            idx_per_sequence = topk_idx.view(bsz, -1)
            if self.seq_aux:
                # Sequence-level balance: per-sequence expert usage counts,
                # scaled by the uniform expectation, times mean router scores.
                seq_scores = scores.view(bsz, seq_len, -1)
                ce = torch.zeros(bsz, self.n_routed_experts, device=flat_states.device)
                ones = torch.ones(bsz, seq_len * self.top_k, device=flat_states.device)
                ce.scatter_add_(1, idx_per_sequence, ones).div_(
                    seq_len * self.top_k / self.n_routed_experts)
                aux_loss = (ce * seq_scores.mean(dim=1)).sum(dim=1).mean() * self.alpha
            else:
                # Batch-level balance: fraction of assignments per expert (fi)
                # times the mean router probability per expert (Pi).
                mask_ce = F.one_hot(idx_per_sequence.view(-1), num_classes=self.n_routed_experts)
                ce = mask_ce.float().mean(0)
                Pi = scores.mean(0)
                fi = ce * self.n_routed_experts
                aux_loss = (Pi * fi).sum() * self.alpha
        return topk_idx, topk_weight, aux_loss
208
+
209
+
210
class MOEFeedForward(nn.Module):
    """Mixture-of-experts feed-forward layer with an optional shared expert.

    Each token is routed to ``config.num_experts_per_tok`` of the routed
    experts; their outputs are combined with the gate weights. The most recent
    auxiliary load-balancing loss returned by the gate is kept on
    ``self.aux_loss`` (``None`` when the gate did not compute one).
    """

    def __init__(self, config: LMConfig):
        super().__init__()
        self.config = config
        self.experts = nn.ModuleList([
            FeedForward(
                dim=config.dim,
                hidden_dim=config.hidden_dim,
                multiple_of=config.multiple_of,
                dropout=config.dropout,
            )
            for _ in range(config.n_routed_experts)
        ])

        self.gate = MoEGate(config)
        # NOTE(review): this only checks for None, so n_shared_experts=False
        # still builds a shared expert. Left unchanged to keep existing
        # checkpoints loadable -- confirm the intended semantics.
        if config.n_shared_experts is not None:
            self.shared_experts = FeedForward(
                dim=config.dim,
                hidden_dim=config.hidden_dim,
                multiple_of=config.multiple_of,
                dropout=config.dropout,
            )
        # Plain attribute (not a parameter or buffer); refreshed on every forward.
        self.aux_loss = None

    def forward(self, x):
        identity = x
        orig_shape = x.shape

        # Use the gate to select experts for every token.
        topk_idx, topk_weight, aux_loss = self.gate(x)
        # Keep the load-balancing loss reachable; it was previously discarded.
        self.aux_loss = aux_loss

        x = x.view(-1, x.shape[-1])
        flat_topk_idx = topk_idx.view(-1)

        if self.training:
            # Training: duplicate each token once per selected expert.
            x = x.repeat_interleave(self.config.num_experts_per_tok, dim=0)
            # Allocate in the input dtype (was hard-coded float16) and cast the
            # expert output to it: masked assignment needs matching dtypes.
            y = torch.empty_like(x)
            for i, expert in enumerate(self.experts):
                selected = flat_topk_idx == i
                y[selected] = expert(x[selected]).to(y.dtype)
            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
            y = y.view(*orig_shape)
        else:
            # Inference: gather the tokens of each expert and run it once.
            y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)

        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)

        return y

    @torch.no_grad()
    def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
        """Gradient-free expert dispatch: run each expert once on its tokens and sum weighted outputs."""
        expert_cache = torch.zeros_like(x)
        # Sort the (token, slot) assignments by expert id.
        idxs = flat_expert_indices.argsort()
        # Cumulative counts give the end offset of every expert's slice in idxs.
        tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
        token_idxs = idxs // self.config.num_experts_per_tok
        # Example: with tokens_per_expert = [6, 15, 20, 26, 33, 38, 46, 52] and
        # token_idxs = [3, 7, 19, 21, 24, 25, 4, 5, 6, 10, 11, 12, ...],
        # token_idxs[:6] -> [3, 7, 19, 21, 24, 25] are the tokens handled by
        # expert 0, token_idxs[6:15] are handled by expert 1, and so on.
        for i, end_idx in enumerate(tokens_per_expert):
            start_idx = 0 if i == 0 else tokens_per_expert[i - 1]
            if start_idx == end_idx:
                continue
            expert = self.experts[i]
            exp_token_idx = token_idxs[start_idx:end_idx]
            expert_tokens = x[exp_token_idx]
            expert_out = expert(expert_tokens)
            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
            # scatter_add_ sums the contributions of all experts chosen for a
            # token; the cast keeps source and destination dtypes equal.
            expert_cache.scatter_add_(
                0,
                exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]),
                expert_out.to(expert_cache.dtype),
            )

        return expert_cache
283
+
284
+
285
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: attention and feed-forward sublayers, each with a residual connection."""

    def __init__(self, layer_id: int, args: LMConfig):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)

        self.layer_id = layer_id
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

        # The feed-forward sublayer is either a mixture of experts or a single dense block.
        self.feed_forward = (
            MOEFeedForward(args)
            if args.use_moe
            else FeedForward(
                dim=args.dim,
                hidden_dim=args.hidden_dim,
                multiple_of=args.multiple_of,
                dropout=args.dropout,
            )
        )

    def forward(self, x, pos_cis, use_kv_cache=False):
        # Each sublayer sees a normalized input and adds its output to the residual stream.
        attn_out = self.attention(self.attention_norm(x), pos_cis, use_kv_cache)
        h = x + attn_out
        return h + self.feed_forward(self.ffn_norm(h))
311
+
312
+
313
class Transformer(PreTrainedModel):
    """Decoder-only language model: token embedding, stacked TransformerBlocks, final norm and tied output head."""

    config_class = LMConfig
    last_loss: Optional[torch.Tensor]

    def __init__(self, params: LMConfig = None):
        # Resolve the default config BEFORE calling the base constructor; the
        # fallback used to run after super().__init__(None), which the base
        # class rejects.
        if not params:
            params = LMConfig()
        super().__init__(params)
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        self.layers = torch.nn.ModuleList()
        for layer_id in range(self.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))
        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
        # Weight tying: the embedding and the output projection share one tensor.
        self.tok_embeddings.weight = self.output.weight
        pos_cis = precompute_pos_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
        # Non-persistent: recomputed on construction rather than stored in checkpoints.
        self.register_buffer("pos_cis", pos_cis, persistent=False)

        self.apply(self._init_weights)

        # Smaller init, scaled by depth, for the w3 and wo projections.
        for pn, p in self.named_parameters():
            if pn.endswith(('w3.weight', 'wo.weight')):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * params.n_layers))

        self.last_loss = None
        # A single output container that is refilled and returned by every forward call.
        self.OUT = CausalLMOutputWithPast()

    def _init_weights(self, module):
        """Initialize Linear and Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, tokens: Optional[torch.Tensor] = None, targets: Optional[torch.Tensor] = None,
                use_kv_cache=False, **keyargs):
        """Run the model and return the shared output object.

        With ``targets`` the logits cover every position and ``last_loss`` is
        the cross-entropy (target id -1 is ignored); without targets only the
        last position's logits are computed and ``last_loss`` is ``None``.
        """
        if 'input_ids' in keyargs:
            tokens = keyargs['input_ids']
        # NOTE(review): a supplied attention_mask is used as `targets`, so the
        # full-sequence logits path runs and the resulting loss is computed
        # against the mask values. Kept as-is; confirm this is intentional.
        if 'attention_mask' in keyargs:
            targets = keyargs['attention_mask']

        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        h = self.dropout(h)
        pos_cis = self.pos_cis[:seqlen]
        for layer in self.layers:
            h = layer(h, pos_cis, use_kv_cache)

        h = self.norm(h)

        if targets is not None:
            logits = self.output(h)
            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # Inference shortcut: only project the final position.
            logits = self.output(h[:, [-1], :])
            self.last_loss = None

        self.OUT.__setitem__('logits', logits)
        self.OUT.__setitem__('last_loss', self.last_loss)

        return self.OUT

    @torch.inference_mode()
    def generate(self, idx, eos, max_new_tokens, temperature=0.7, top_k=None, stream=True, repetition_penalty=1.,
                 use_kv_cache=True):
        """Sample tokens after the prompt ``idx`` and yield the generated suffix.

        With ``stream=True`` the growing suffix is yielded after every new
        token; otherwise it is yielded once at the end. Sampling stops at
        ``eos`` or once the total sequence length reaches
        ``max_new_tokens - 1`` (the bound applies to prompt plus generated
        tokens).
        """
        # Drop key/value caches left over from an earlier call; a new prompt
        # that happens to be one token longer than the cached sequence would
        # otherwise silently reuse stale keys and values.
        for layer in self.layers:
            layer.attention.k_cache, layer.attention.v_cache = None, None

        index = idx.shape[1]
        while idx.shape[1] < max_new_tokens - 1:
            inference_res = self(idx, use_kv_cache=use_kv_cache)
            logits = inference_res.logits
            logits = logits[:, -1, :]

            # NOTE(review): the penalty divides every seen token's logit, which
            # raises the probability of tokens with negative logits, and only
            # the first batch row's history is consulted. Behavior kept.
            for token in set(idx.tolist()[0]):
                logits[:, token] /= repetition_penalty

            if temperature == 0.0:
                # Greedy decoding.
                _, idx_next = torch.topk(logits, k=1, dim=-1)
            else:
                logits = logits / temperature
                if top_k is not None:
                    # Mask everything below the k-th largest logit.
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float('Inf')

                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1, generator=None)

            if idx_next == eos:
                break

            idx = torch.cat((idx, idx_next), dim=1)
            if stream:
                yield idx[:, index:]

        if not stream:
            yield idx[:, index:]

    @torch.inference_mode()
    def eval_answer(self, idx):
        """Return the logits for the last position of ``idx``, using at most the final ``max_seq_len`` tokens."""
        idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
        inference_res = self(idx_cond)
        logits = inference_res.logits
        logits = logits[:, -1, :]
        return logits
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80f6d97a9e2f2adac9d378e29027bfc5672fe6321d26e52d467588ead5f41e7f
3
+ size 384461330
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<s>user\\n' + content + '</s>\\n<s>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "</s>",
36
+ "legacy": true,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": null,
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "PreTrainedTokenizerFast",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }