pkalkman committed (verified)
Commit a8ec9cc · 1 Parent(s): c3528b4

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json (+25 -14)
  2. tokenizer.json (+6 -15)
  3. tokenizer_config.json (+13 -17)
special_tokens_map.json CHANGED
@@ -1,4 +1,27 @@
 {
+  "additional_special_tokens": [
+    {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|im_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
   "bos_token": {
     "content": "<|endoftext|>",
     "lstrip": true,
@@ -13,18 +36,6 @@
     "rstrip": true,
     "single_word": false
   },
-  "pad_token": {
-    "content": "<|dummy_87|>",
-    "lstrip": true,
-    "normalized": false,
-    "rstrip": true,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "�",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
+  "pad_token": "<|im_end|>",
+  "unk_token": "<|endoftext|>"
 }
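
Taken together, this file now registers the three chat markers under "additional_special_tokens" and re-points "pad_token" and "unk_token" at tokens that exist in the vocabulary. A minimal sketch of verifying the result with transformers, assuming the committed files are saved to a local directory ("./tokenizer_dir" is a placeholder path, not part of this repository):

```python
# Hypothetical check of the updated special-token map; the local path is an
# assumption, not the repo id.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./tokenizer_dir")

# The ChatML-style markers are now registered as additional special tokens.
print(tok.additional_special_tokens)  # ['<|im_start|>', '<|im_sep|>', '<|im_end|>']

# pad_token now aliases <|im_end|> instead of <|dummy_87|>, and unk_token
# falls back to <|endoftext|> instead of the replacement character.
print(tok.pad_token, tok.unk_token)   # <|im_end|> <|endoftext|>
```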
tokenizer.json CHANGED
@@ -8,15 +8,6 @@
   },
   "padding": null,
   "added_tokens": [
-    {
-      "id": 5809,
-      "content": "�",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
     {
       "id": 100256,
       "content": "<|dummy_0|>",
@@ -93,8 +84,8 @@
       "id": 100264,
       "content": "<|im_start|>",
       "single_word": false,
-      "lstrip": true,
-      "rstrip": true,
+      "lstrip": false,
+      "rstrip": false,
       "normalized": false,
       "special": true
     },
@@ -102,8 +93,8 @@
       "id": 100265,
       "content": "<|im_end|>",
       "single_word": false,
-      "lstrip": true,
-      "rstrip": true,
+      "lstrip": false,
+      "rstrip": false,
       "normalized": false,
       "special": true
     },
@@ -111,8 +102,8 @@
       "id": 100266,
       "content": "<|im_sep|>",
       "single_word": false,
-      "lstrip": true,
-      "rstrip": true,
+      "lstrip": false,
+      "rstrip": false,
       "normalized": false,
       "special": true
     },
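
The changes here flip "lstrip"/"rstrip" from true to false on the three chat markers (and drop the stray id-5809 replacement-character entry), so the tokenizer should no longer swallow whitespace adjacent to them. A hedged sketch of the expected round-trip effect, under the same local-path assumption as above:

```python
# Sketch of the rstrip change: with the old settings, whitespace following
# <|im_start|> was absorbed into the special-token match and lost on decode.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./tokenizer_dir")  # placeholder path

text = "<|im_start|> system"
ids = tok(text, add_special_tokens=False)["input_ids"]
print(repr(tok.decode(ids)))
# Expected with rstrip=false: '<|im_start|> system' (space preserved)
# Old behavior with rstrip=true: '<|im_start|>system'
```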
tokenizer_config.json CHANGED
@@ -1,14 +1,6 @@
 {
   "add_prefix_space": false,
   "added_tokens_decoder": {
-    "5809": {
-      "content": "�",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
     "100256": {
       "content": "<|dummy_0|>",
       "lstrip": true,
@@ -75,25 +67,25 @@
     },
     "100264": {
       "content": "<|im_start|>",
-      "lstrip": true,
+      "lstrip": false,
       "normalized": false,
-      "rstrip": true,
+      "rstrip": false,
       "single_word": false,
       "special": true
     },
     "100265": {
       "content": "<|im_end|>",
-      "lstrip": true,
+      "lstrip": false,
       "normalized": false,
-      "rstrip": true,
+      "rstrip": false,
       "single_word": false,
       "special": true
     },
     "100266": {
       "content": "<|im_sep|>",
-      "lstrip": true,
+      "lstrip": false,
       "normalized": false,
-      "rstrip": true,
+      "rstrip": false,
       "single_word": false,
       "special": true
     },
@@ -778,14 +770,18 @@
       "special": true
     }
   },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_sep|>",
+    "<|im_end|>"
+  ],
   "bos_token": "<|endoftext|>",
   "chat_template": "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "extra_special_tokens": {},
   "model_max_length": 16384,
-  "pad_token": "<|dummy_87|>",
-  "padding_side": "right",
+  "pad_token": "<|im_end|>",
   "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "�"
+  "unk_token": "<|endoftext|>"
 }
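
The unchanged "chat_template" above wraps every turn as <|im_start|>{role}<|im_sep|>{content}<|im_end|> and appends a bare assistant header when a generation prompt is requested; the new pad/unk settings line up with that format. A minimal sketch of rendering it, again assuming a local copy of the committed files:

```python
# Render the ChatML-style template defined in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./tokenizer_dir")  # placeholder path

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# One continuous string (the template inserts no newlines):
# <|im_start|>system<|im_sep|>You are a helpful assistant.<|im_end|><|im_start|>user<|im_sep|>Hello!<|im_end|><|im_start|>assistant<|im_sep|>
```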