Narsil (HF staff) committed
Commit 1e7bfc4
1 parent: ff07dc0

Create the tokenizer.json properly (with TemplateProcessing included).


Turns out, transformers overrides the TemplateProcessing, meaning that in Python land the bug is not seen:
https://github.com/huggingface/transformers/blame/main/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205

However, in Rust land the definition is taken exactly from the tokenizer.json file, so the TemplateProcessing post-processor is missing.

Adding it to the file fixes the issue in Rust land without affecting Python land (since the override is still there).
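A quick way to see the divergence, as a minimal sketch (paths are placeholders and assume a local checkout of this repo):

```python
from tokenizers import Tokenizer        # Rust-backed: reads tokenizer.json as-is
from transformers import AutoTokenizer  # Python: re-applies the override on load

# Rust land: only what tokenizer.json declares is used, so without this fix
# the post-processor is absent and no BOS token is prepended.
rust_tok = Tokenizer.from_file("tokenizer.json")
print(rust_tok.encode("Hello").ids)  # before the fix: no leading 1 (<s>)

# Python land: LlamaTokenizerFast rebuilds the TemplateProcessing itself,
# so the BOS token shows up regardless of what the file contains.
py_tok = AutoTokenizer.from_pretrained(".")
print(py_tok("Hello")["input_ids"])  # starts with 1 (<s>) either way
```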

Files changed (1): tokenizer.json (+31 −2)
tokenizer.json CHANGED
@@ -128,7 +128,7 @@
       "rstrip": true,
       "normalized": false,
       "special": true
-    }
+    }
   ],
   "normalizer": {
     "type": "Sequence",
@@ -150,6 +150,12 @@
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 0
+        }
+      },
       {
         "Sequence": {
           "id": "A",
@@ -158,12 +164,24 @@
       }
     ],
     "pair": [
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 0
+        }
+      },
       {
         "Sequence": {
           "id": "A",
           "type_id": 0
         }
       },
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 1
+        }
+      },
       {
         "Sequence": {
           "id": "B",
@@ -171,7 +189,17 @@
         }
       }
     ],
-    "special_tokens": {}
+    "special_tokens": {
+      "<s>": {
+        "id": "<s>",
+        "ids": [
+          1
+        ],
+        "tokens": [
+          "<s>"
+        ]
+      }
+    }
   },
   "decoder": {
     "type": "Sequence",
@@ -205,6 +233,7 @@
     "end_of_word_suffix": null,
     "fuse_unk": true,
     "byte_fallback": true,
+    "ignore_merges": false,
     "vocab": {
       "<unk>": 0,
       "<s>": 1,