alonj committed on
Commit
d9adef4
·
verified ·
1 Parent(s): 8447e2c

Upload tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +22 -34
  2. special_tokens_map.json +43 -28
  3. tokenizer.json +87 -123
  4. tokenizer_config.json +123 -146
added_tokens.json CHANGED
@@ -1,36 +1,24 @@
1
  {
2
- "ADJ": 49152,
3
- "ADP": 49153,
4
- "ADV": 49154,
5
- "AUX": 49155,
6
- "INTJ": 49156,
7
- "JJ": 49165,
8
- "JJR": 49166,
9
- "JJS": 49167,
10
- "NNP": 49169,
11
- "NNPS": 49170,
12
- "NNS": 49168,
13
- "NOUN": 49157,
14
- "PART": 49158,
15
- "PDT": 49171,
16
- "PRON": 49159,
17
- "PROPN": 49160,
18
- "PRP": 49172,
19
- "PRP$": 49173,
20
- "PUNCT": 49161,
21
- "RBR": 49174,
22
- "RBS": 49175,
23
- "SCONJ": 49162,
24
- "SYM": 49163,
25
- "UH": 49176,
26
- "VB": 49177,
27
- "VBD": 49178,
28
- "VBG": 49179,
29
- "VBN": 49180,
30
- "VBP": 49181,
31
- "VBZ": 49182,
32
- "VERB": 49164,
33
- "WDT": 49183,
34
- "WP$": 49184,
35
- "WRB": 49185
36
  }
 
1
  {
2
+ "JJ": 49157,
3
+ "JJR": 49170,
4
+ "JJS": 49169,
5
+ "NNP": 49162,
6
+ "NNPS": 49168,
7
+ "NNS": 49161,
8
+ "PDT": 49164,
9
+ "PRP": 49166,
10
+ "PRP$": 49152,
11
+ "RBR": 49159,
12
+ "RBS": 49165,
13
+ "SYM": 49171,
14
+ "UH": 49172,
15
+ "VB": 49154,
16
+ "VBD": 49160,
17
+ "VBG": 49153,
18
+ "VBN": 49156,
19
+ "VBP": 49155,
20
+ "VBZ": 49158,
21
+ "WDT": 49173,
22
+ "WP$": 49167,
23
+ "WRB": 49163
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
special_tokens_map.json CHANGED
@@ -1,41 +1,50 @@
1
  {
2
  "additional_special_tokens": [
3
- "CC",
4
- "CD",
5
- "DT",
6
- "EX",
7
  "FW",
8
- "IN",
 
 
 
 
9
  "JJ",
10
- "JJR",
11
- "JJS",
12
- "LS",
13
- "MD",
 
14
  "NN",
 
 
 
 
 
 
 
 
 
 
15
  "NNS",
16
  "NNP",
17
- "NNPS",
 
 
18
  "PDT",
19
- "POS",
20
- "PRP",
21
- "PRP$",
22
- "RB",
23
- "RBR",
24
  "RBS",
25
- "RP",
 
 
 
 
 
 
 
 
 
26
  "SYM",
27
- "TO",
28
  "UH",
29
- "VB",
30
- "VBD",
31
- "VBG",
32
- "VBN",
33
- "VBP",
34
- "VBZ",
35
- "WDT",
36
- "WP",
37
- "WP$",
38
- "WRB"
39
  ],
40
  "bos_token": {
41
  "content": "<|endoftext|>",
@@ -51,7 +60,13 @@
51
  "rstrip": false,
52
  "single_word": false
53
  },
54
- "pad_token": "<|endoftext|>",
 
 
 
 
 
 
55
  "unk_token": {
56
  "content": "<|endoftext|>",
57
  "lstrip": false,
 
1
  {
2
  "additional_special_tokens": [
3
+ "PRP$",
4
+ "VBG",
 
 
5
  "FW",
6
+ "VB",
7
+ "POS",
8
+ "''",
9
+ "VBP",
10
+ "VBN",
11
  "JJ",
12
+ "WP",
13
+ "VBZ",
14
+ "DT",
15
+ "RP",
16
+ "$",
17
  "NN",
18
+ ")",
19
+ "(",
20
+ "RBR",
21
+ "VBD",
22
+ ",",
23
+ ".",
24
+ "TO",
25
+ "LS",
26
+ "RB",
27
+ ":",
28
  "NNS",
29
  "NNP",
30
+ "``",
31
+ "WRB",
32
+ "CC",
33
  "PDT",
 
 
 
 
 
34
  "RBS",
35
+ "PRP",
36
+ "CD",
37
+ "EX",
38
+ "IN",
39
+ "WP$",
40
+ "MD",
41
+ "NNPS",
42
+ "--",
43
+ "JJS",
44
+ "JJR",
45
  "SYM",
 
46
  "UH",
47
+ "WDT"
 
 
 
 
 
 
 
 
 
48
  ],
49
  "bos_token": {
50
  "content": "<|endoftext|>",
 
60
  "rstrip": false,
61
  "single_word": false
62
  },
63
+ "pad_token": {
64
+ "content": "<|endoftext|>",
65
+ "lstrip": false,
66
+ "normalized": false,
67
+ "rstrip": false,
68
+ "single_word": false
69
+ },
70
  "unk_token": {
71
  "content": "<|endoftext|>",
72
  "lstrip": false,
tokenizer.json CHANGED
@@ -157,8 +157,8 @@
157
  "special": true
158
  },
159
  {
160
- "id": 2113,
161
- "content": "IN",
162
  "single_word": false,
163
  "lstrip": false,
164
  "rstrip": false,
@@ -166,8 +166,8 @@
166
  "special": true
167
  },
168
  {
169
- "id": 5137,
170
- "content": "CC",
171
  "single_word": false,
172
  "lstrip": false,
173
  "rstrip": false,
@@ -175,8 +175,8 @@
175
  "special": true
176
  },
177
  {
178
- "id": 6998,
179
- "content": "CD",
180
  "single_word": false,
181
  "lstrip": false,
182
  "rstrip": false,
@@ -184,8 +184,8 @@
184
  "special": true
185
  },
186
  {
187
- "id": 9324,
188
- "content": "EX",
189
  "single_word": false,
190
  "lstrip": false,
191
  "rstrip": false,
@@ -193,8 +193,8 @@
193
  "special": true
194
  },
195
  {
196
- "id": 9815,
197
- "content": "MD",
198
  "single_word": false,
199
  "lstrip": false,
200
  "rstrip": false,
@@ -202,8 +202,8 @@
202
  "special": true
203
  },
204
  {
205
- "id": 10179,
206
- "content": "TO",
207
  "single_word": false,
208
  "lstrip": false,
209
  "rstrip": false,
@@ -211,8 +211,8 @@
211
  "special": true
212
  },
213
  {
214
- "id": 12041,
215
- "content": "LS",
216
  "single_word": false,
217
  "lstrip": false,
218
  "rstrip": false,
@@ -220,8 +220,8 @@
220
  "special": true
221
  },
222
  {
223
- "id": 17679,
224
- "content": "DT",
225
  "single_word": false,
226
  "lstrip": false,
227
  "rstrip": false,
@@ -229,8 +229,8 @@
229
  "special": true
230
  },
231
  {
232
- "id": 19668,
233
- "content": "NN",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
@@ -238,8 +238,8 @@
238
  "special": true
239
  },
240
  {
241
- "id": 22259,
242
- "content": "POS",
243
  "single_word": false,
244
  "lstrip": false,
245
  "rstrip": false,
@@ -247,8 +247,8 @@
247
  "special": true
248
  },
249
  {
250
- "id": 25398,
251
- "content": "NUM",
252
  "single_word": false,
253
  "lstrip": false,
254
  "rstrip": false,
@@ -256,8 +256,8 @@
256
  "special": true
257
  },
258
  {
259
- "id": 29482,
260
- "content": "RP",
261
  "single_word": false,
262
  "lstrip": false,
263
  "rstrip": false,
@@ -265,8 +265,8 @@
265
  "special": true
266
  },
267
  {
268
- "id": 35991,
269
- "content": "FW",
270
  "single_word": false,
271
  "lstrip": false,
272
  "rstrip": false,
@@ -274,8 +274,8 @@
274
  "special": true
275
  },
276
  {
277
- "id": 45780,
278
- "content": "RB",
279
  "single_word": false,
280
  "lstrip": false,
281
  "rstrip": false,
@@ -283,8 +283,8 @@
283
  "special": true
284
  },
285
  {
286
- "id": 47306,
287
- "content": "WP",
288
  "single_word": false,
289
  "lstrip": false,
290
  "rstrip": false,
@@ -292,8 +292,8 @@
292
  "special": true
293
  },
294
  {
295
- "id": 49152,
296
- "content": "ADJ",
297
  "single_word": false,
298
  "lstrip": false,
299
  "rstrip": false,
@@ -301,8 +301,8 @@
301
  "special": true
302
  },
303
  {
304
- "id": 49153,
305
- "content": "ADP",
306
  "single_word": false,
307
  "lstrip": false,
308
  "rstrip": false,
@@ -310,8 +310,8 @@
310
  "special": true
311
  },
312
  {
313
- "id": 49154,
314
- "content": "ADV",
315
  "single_word": false,
316
  "lstrip": false,
317
  "rstrip": false,
@@ -319,8 +319,8 @@
319
  "special": true
320
  },
321
  {
322
- "id": 49155,
323
- "content": "AUX",
324
  "single_word": false,
325
  "lstrip": false,
326
  "rstrip": false,
@@ -328,8 +328,8 @@
328
  "special": true
329
  },
330
  {
331
- "id": 49156,
332
- "content": "INTJ",
333
  "single_word": false,
334
  "lstrip": false,
335
  "rstrip": false,
@@ -337,8 +337,8 @@
337
  "special": true
338
  },
339
  {
340
- "id": 49157,
341
- "content": "NOUN",
342
  "single_word": false,
343
  "lstrip": false,
344
  "rstrip": false,
@@ -346,8 +346,8 @@
346
  "special": true
347
  },
348
  {
349
- "id": 49158,
350
- "content": "PART",
351
  "single_word": false,
352
  "lstrip": false,
353
  "rstrip": false,
@@ -355,8 +355,8 @@
355
  "special": true
356
  },
357
  {
358
- "id": 49159,
359
- "content": "PRON",
360
  "single_word": false,
361
  "lstrip": false,
362
  "rstrip": false,
@@ -364,8 +364,8 @@
364
  "special": true
365
  },
366
  {
367
- "id": 49160,
368
- "content": "PROPN",
369
  "single_word": false,
370
  "lstrip": false,
371
  "rstrip": false,
@@ -373,8 +373,8 @@
373
  "special": true
374
  },
375
  {
376
- "id": 49161,
377
- "content": "PUNCT",
378
  "single_word": false,
379
  "lstrip": false,
380
  "rstrip": false,
@@ -382,8 +382,8 @@
382
  "special": true
383
  },
384
  {
385
- "id": 49162,
386
- "content": "SCONJ",
387
  "single_word": false,
388
  "lstrip": false,
389
  "rstrip": false,
@@ -391,8 +391,8 @@
391
  "special": true
392
  },
393
  {
394
- "id": 49163,
395
- "content": "SYM",
396
  "single_word": false,
397
  "lstrip": false,
398
  "rstrip": false,
@@ -400,8 +400,8 @@
400
  "special": true
401
  },
402
  {
403
- "id": 49164,
404
- "content": "VERB",
405
  "single_word": false,
406
  "lstrip": false,
407
  "rstrip": false,
@@ -409,7 +409,7 @@
409
  "special": true
410
  },
411
  {
412
- "id": 49165,
413
  "content": "JJ",
414
  "single_word": false,
415
  "lstrip": false,
@@ -418,26 +418,8 @@
418
  "special": true
419
  },
420
  {
421
- "id": 49166,
422
- "content": "JJR",
423
- "single_word": false,
424
- "lstrip": false,
425
- "rstrip": false,
426
- "normalized": false,
427
- "special": true
428
- },
429
- {
430
- "id": 49167,
431
- "content": "JJS",
432
- "single_word": false,
433
- "lstrip": false,
434
- "rstrip": false,
435
- "normalized": false,
436
- "special": true
437
- },
438
- {
439
- "id": 49168,
440
- "content": "NNS",
441
  "single_word": false,
442
  "lstrip": false,
443
  "rstrip": false,
@@ -445,8 +427,8 @@
445
  "special": true
446
  },
447
  {
448
- "id": 49169,
449
- "content": "NNP",
450
  "single_word": false,
451
  "lstrip": false,
452
  "rstrip": false,
@@ -454,8 +436,8 @@
454
  "special": true
455
  },
456
  {
457
- "id": 49170,
458
- "content": "NNPS",
459
  "single_word": false,
460
  "lstrip": false,
461
  "rstrip": false,
@@ -463,8 +445,8 @@
463
  "special": true
464
  },
465
  {
466
- "id": 49171,
467
- "content": "PDT",
468
  "single_word": false,
469
  "lstrip": false,
470
  "rstrip": false,
@@ -472,8 +454,8 @@
472
  "special": true
473
  },
474
  {
475
- "id": 49172,
476
- "content": "PRP",
477
  "single_word": false,
478
  "lstrip": false,
479
  "rstrip": false,
@@ -481,8 +463,8 @@
481
  "special": true
482
  },
483
  {
484
- "id": 49173,
485
- "content": "PRP$",
486
  "single_word": false,
487
  "lstrip": false,
488
  "rstrip": false,
@@ -490,8 +472,8 @@
490
  "special": true
491
  },
492
  {
493
- "id": 49174,
494
- "content": "RBR",
495
  "single_word": false,
496
  "lstrip": false,
497
  "rstrip": false,
@@ -499,7 +481,7 @@
499
  "special": true
500
  },
501
  {
502
- "id": 49175,
503
  "content": "RBS",
504
  "single_word": false,
505
  "lstrip": false,
@@ -508,8 +490,8 @@
508
  "special": true
509
  },
510
  {
511
- "id": 49176,
512
- "content": "UH",
513
  "single_word": false,
514
  "lstrip": false,
515
  "rstrip": false,
@@ -517,8 +499,8 @@
517
  "special": true
518
  },
519
  {
520
- "id": 49177,
521
- "content": "VB",
522
  "single_word": false,
523
  "lstrip": false,
524
  "rstrip": false,
@@ -526,8 +508,8 @@
526
  "special": true
527
  },
528
  {
529
- "id": 49178,
530
- "content": "VBD",
531
  "single_word": false,
532
  "lstrip": false,
533
  "rstrip": false,
@@ -535,8 +517,8 @@
535
  "special": true
536
  },
537
  {
538
- "id": 49179,
539
- "content": "VBG",
540
  "single_word": false,
541
  "lstrip": false,
542
  "rstrip": false,
@@ -544,8 +526,8 @@
544
  "special": true
545
  },
546
  {
547
- "id": 49180,
548
- "content": "VBN",
549
  "single_word": false,
550
  "lstrip": false,
551
  "rstrip": false,
@@ -553,8 +535,8 @@
553
  "special": true
554
  },
555
  {
556
- "id": 49181,
557
- "content": "VBP",
558
  "single_word": false,
559
  "lstrip": false,
560
  "rstrip": false,
@@ -562,8 +544,8 @@
562
  "special": true
563
  },
564
  {
565
- "id": 49182,
566
- "content": "VBZ",
567
  "single_word": false,
568
  "lstrip": false,
569
  "rstrip": false,
@@ -571,31 +553,13 @@
571
  "special": true
572
  },
573
  {
574
- "id": 49183,
575
  "content": "WDT",
576
  "single_word": false,
577
  "lstrip": false,
578
  "rstrip": false,
579
  "normalized": false,
580
  "special": true
581
- },
582
- {
583
- "id": 49184,
584
- "content": "WP$",
585
- "single_word": false,
586
- "lstrip": false,
587
- "rstrip": false,
588
- "normalized": false,
589
- "special": true
590
- },
591
- {
592
- "id": 49185,
593
- "content": "WRB",
594
- "single_word": false,
595
- "lstrip": false,
596
- "rstrip": false,
597
- "normalized": false,
598
- "special": true
599
  }
600
  ],
601
  "normalizer": null,
 
157
  "special": true
158
  },
159
  {
160
+ "id": 20,
161
+ "content": "$",
162
  "single_word": false,
163
  "lstrip": false,
164
  "rstrip": false,
 
166
  "special": true
167
  },
168
  {
169
+ "id": 24,
170
+ "content": "(",
171
  "single_word": false,
172
  "lstrip": false,
173
  "rstrip": false,
 
175
  "special": true
176
  },
177
  {
178
+ "id": 25,
179
+ "content": ")",
180
  "single_word": false,
181
  "lstrip": false,
182
  "rstrip": false,
 
184
  "special": true
185
  },
186
  {
187
+ "id": 28,
188
+ "content": ",",
189
  "single_word": false,
190
  "lstrip": false,
191
  "rstrip": false,
 
193
  "special": true
194
  },
195
  {
196
+ "id": 30,
197
+ "content": ".",
198
  "single_word": false,
199
  "lstrip": false,
200
  "rstrip": false,
 
202
  "special": true
203
  },
204
  {
205
+ "id": 42,
206
+ "content": ":",
207
  "single_word": false,
208
  "lstrip": false,
209
  "rstrip": false,
 
211
  "special": true
212
  },
213
  {
214
+ "id": 423,
215
+ "content": "--",
216
  "single_word": false,
217
  "lstrip": false,
218
  "rstrip": false,
 
220
  "special": true
221
  },
222
  {
223
+ "id": 1969,
224
+ "content": "``",
225
  "single_word": false,
226
  "lstrip": false,
227
  "rstrip": false,
 
229
  "special": true
230
  },
231
  {
232
+ "id": 2113,
233
+ "content": "IN",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
 
238
  "special": true
239
  },
240
  {
241
+ "id": 3816,
242
+ "content": "''",
243
  "single_word": false,
244
  "lstrip": false,
245
  "rstrip": false,
 
247
  "special": true
248
  },
249
  {
250
+ "id": 5137,
251
+ "content": "CC",
252
  "single_word": false,
253
  "lstrip": false,
254
  "rstrip": false,
 
256
  "special": true
257
  },
258
  {
259
+ "id": 6998,
260
+ "content": "CD",
261
  "single_word": false,
262
  "lstrip": false,
263
  "rstrip": false,
 
265
  "special": true
266
  },
267
  {
268
+ "id": 9324,
269
+ "content": "EX",
270
  "single_word": false,
271
  "lstrip": false,
272
  "rstrip": false,
 
274
  "special": true
275
  },
276
  {
277
+ "id": 9815,
278
+ "content": "MD",
279
  "single_word": false,
280
  "lstrip": false,
281
  "rstrip": false,
 
283
  "special": true
284
  },
285
  {
286
+ "id": 10179,
287
+ "content": "TO",
288
  "single_word": false,
289
  "lstrip": false,
290
  "rstrip": false,
 
292
  "special": true
293
  },
294
  {
295
+ "id": 12041,
296
+ "content": "LS",
297
  "single_word": false,
298
  "lstrip": false,
299
  "rstrip": false,
 
301
  "special": true
302
  },
303
  {
304
+ "id": 17679,
305
+ "content": "DT",
306
  "single_word": false,
307
  "lstrip": false,
308
  "rstrip": false,
 
310
  "special": true
311
  },
312
  {
313
+ "id": 19668,
314
+ "content": "NN",
315
  "single_word": false,
316
  "lstrip": false,
317
  "rstrip": false,
 
319
  "special": true
320
  },
321
  {
322
+ "id": 22259,
323
+ "content": "POS",
324
  "single_word": false,
325
  "lstrip": false,
326
  "rstrip": false,
 
328
  "special": true
329
  },
330
  {
331
+ "id": 29482,
332
+ "content": "RP",
333
  "single_word": false,
334
  "lstrip": false,
335
  "rstrip": false,
 
337
  "special": true
338
  },
339
  {
340
+ "id": 35991,
341
+ "content": "FW",
342
  "single_word": false,
343
  "lstrip": false,
344
  "rstrip": false,
 
346
  "special": true
347
  },
348
  {
349
+ "id": 45780,
350
+ "content": "RB",
351
  "single_word": false,
352
  "lstrip": false,
353
  "rstrip": false,
 
355
  "special": true
356
  },
357
  {
358
+ "id": 47306,
359
+ "content": "WP",
360
  "single_word": false,
361
  "lstrip": false,
362
  "rstrip": false,
 
364
  "special": true
365
  },
366
  {
367
+ "id": 49152,
368
+ "content": "PRP$",
369
  "single_word": false,
370
  "lstrip": false,
371
  "rstrip": false,
 
373
  "special": true
374
  },
375
  {
376
+ "id": 49153,
377
+ "content": "VBG",
378
  "single_word": false,
379
  "lstrip": false,
380
  "rstrip": false,
 
382
  "special": true
383
  },
384
  {
385
+ "id": 49154,
386
+ "content": "VB",
387
  "single_word": false,
388
  "lstrip": false,
389
  "rstrip": false,
 
391
  "special": true
392
  },
393
  {
394
+ "id": 49155,
395
+ "content": "VBP",
396
  "single_word": false,
397
  "lstrip": false,
398
  "rstrip": false,
 
400
  "special": true
401
  },
402
  {
403
+ "id": 49156,
404
+ "content": "VBN",
405
  "single_word": false,
406
  "lstrip": false,
407
  "rstrip": false,
 
409
  "special": true
410
  },
411
  {
412
+ "id": 49157,
413
  "content": "JJ",
414
  "single_word": false,
415
  "lstrip": false,
 
418
  "special": true
419
  },
420
  {
421
+ "id": 49158,
422
+ "content": "VBZ",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  "single_word": false,
424
  "lstrip": false,
425
  "rstrip": false,
 
427
  "special": true
428
  },
429
  {
430
+ "id": 49159,
431
+ "content": "RBR",
432
  "single_word": false,
433
  "lstrip": false,
434
  "rstrip": false,
 
436
  "special": true
437
  },
438
  {
439
+ "id": 49160,
440
+ "content": "VBD",
441
  "single_word": false,
442
  "lstrip": false,
443
  "rstrip": false,
 
445
  "special": true
446
  },
447
  {
448
+ "id": 49161,
449
+ "content": "NNS",
450
  "single_word": false,
451
  "lstrip": false,
452
  "rstrip": false,
 
454
  "special": true
455
  },
456
  {
457
+ "id": 49162,
458
+ "content": "NNP",
459
  "single_word": false,
460
  "lstrip": false,
461
  "rstrip": false,
 
463
  "special": true
464
  },
465
  {
466
+ "id": 49163,
467
+ "content": "WRB",
468
  "single_word": false,
469
  "lstrip": false,
470
  "rstrip": false,
 
472
  "special": true
473
  },
474
  {
475
+ "id": 49164,
476
+ "content": "PDT",
477
  "single_word": false,
478
  "lstrip": false,
479
  "rstrip": false,
 
481
  "special": true
482
  },
483
  {
484
+ "id": 49165,
485
  "content": "RBS",
486
  "single_word": false,
487
  "lstrip": false,
 
490
  "special": true
491
  },
492
  {
493
+ "id": 49166,
494
+ "content": "PRP",
495
  "single_word": false,
496
  "lstrip": false,
497
  "rstrip": false,
 
499
  "special": true
500
  },
501
  {
502
+ "id": 49167,
503
+ "content": "WP$",
504
  "single_word": false,
505
  "lstrip": false,
506
  "rstrip": false,
 
508
  "special": true
509
  },
510
  {
511
+ "id": 49168,
512
+ "content": "NNPS",
513
  "single_word": false,
514
  "lstrip": false,
515
  "rstrip": false,
 
517
  "special": true
518
  },
519
  {
520
+ "id": 49169,
521
+ "content": "JJS",
522
  "single_word": false,
523
  "lstrip": false,
524
  "rstrip": false,
 
526
  "special": true
527
  },
528
  {
529
+ "id": 49170,
530
+ "content": "JJR",
531
  "single_word": false,
532
  "lstrip": false,
533
  "rstrip": false,
 
535
  "special": true
536
  },
537
  {
538
+ "id": 49171,
539
+ "content": "SYM",
540
  "single_word": false,
541
  "lstrip": false,
542
  "rstrip": false,
 
544
  "special": true
545
  },
546
  {
547
+ "id": 49172,
548
+ "content": "UH",
549
  "single_word": false,
550
  "lstrip": false,
551
  "rstrip": false,
 
553
  "special": true
554
  },
555
  {
556
+ "id": 49173,
557
  "content": "WDT",
558
  "single_word": false,
559
  "lstrip": false,
560
  "rstrip": false,
561
  "normalized": false,
562
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  }
564
  ],
565
  "normalizer": null,
tokenizer_config.json CHANGED
@@ -137,231 +137,231 @@
137
  "single_word": false,
138
  "special": true
139
  },
140
- "2113": {
141
- "content": "IN",
142
  "lstrip": false,
143
  "normalized": false,
144
  "rstrip": false,
145
  "single_word": false,
146
  "special": true
147
  },
148
- "5137": {
149
- "content": "CC",
150
  "lstrip": false,
151
  "normalized": false,
152
  "rstrip": false,
153
  "single_word": false,
154
  "special": true
155
  },
156
- "6998": {
157
- "content": "CD",
158
  "lstrip": false,
159
  "normalized": false,
160
  "rstrip": false,
161
  "single_word": false,
162
  "special": true
163
  },
164
- "9324": {
165
- "content": "EX",
166
  "lstrip": false,
167
  "normalized": false,
168
  "rstrip": false,
169
  "single_word": false,
170
  "special": true
171
  },
172
- "9815": {
173
- "content": "MD",
174
  "lstrip": false,
175
  "normalized": false,
176
  "rstrip": false,
177
  "single_word": false,
178
  "special": true
179
  },
180
- "10179": {
181
- "content": "TO",
182
  "lstrip": false,
183
  "normalized": false,
184
  "rstrip": false,
185
  "single_word": false,
186
  "special": true
187
  },
188
- "12041": {
189
- "content": "LS",
190
  "lstrip": false,
191
  "normalized": false,
192
  "rstrip": false,
193
  "single_word": false,
194
  "special": true
195
  },
196
- "17679": {
197
- "content": "DT",
198
  "lstrip": false,
199
  "normalized": false,
200
  "rstrip": false,
201
  "single_word": false,
202
  "special": true
203
  },
204
- "19668": {
205
- "content": "NN",
206
  "lstrip": false,
207
  "normalized": false,
208
  "rstrip": false,
209
  "single_word": false,
210
  "special": true
211
  },
212
- "22259": {
213
- "content": "POS",
214
  "lstrip": false,
215
  "normalized": false,
216
  "rstrip": false,
217
  "single_word": false,
218
  "special": true
219
  },
220
- "25398": {
221
- "content": "NUM",
222
  "lstrip": false,
223
  "normalized": false,
224
  "rstrip": false,
225
  "single_word": false,
226
  "special": true
227
  },
228
- "29482": {
229
- "content": "RP",
230
  "lstrip": false,
231
  "normalized": false,
232
  "rstrip": false,
233
  "single_word": false,
234
  "special": true
235
  },
236
- "35991": {
237
- "content": "FW",
238
  "lstrip": false,
239
  "normalized": false,
240
  "rstrip": false,
241
  "single_word": false,
242
  "special": true
243
  },
244
- "45780": {
245
- "content": "RB",
246
  "lstrip": false,
247
  "normalized": false,
248
  "rstrip": false,
249
  "single_word": false,
250
  "special": true
251
  },
252
- "47306": {
253
- "content": "WP",
254
  "lstrip": false,
255
  "normalized": false,
256
  "rstrip": false,
257
  "single_word": false,
258
  "special": true
259
  },
260
- "49152": {
261
- "content": "ADJ",
262
  "lstrip": false,
263
  "normalized": false,
264
  "rstrip": false,
265
  "single_word": false,
266
  "special": true
267
  },
268
- "49153": {
269
- "content": "ADP",
270
  "lstrip": false,
271
  "normalized": false,
272
  "rstrip": false,
273
  "single_word": false,
274
  "special": true
275
  },
276
- "49154": {
277
- "content": "ADV",
278
  "lstrip": false,
279
  "normalized": false,
280
  "rstrip": false,
281
  "single_word": false,
282
  "special": true
283
  },
284
- "49155": {
285
- "content": "AUX",
286
  "lstrip": false,
287
  "normalized": false,
288
  "rstrip": false,
289
  "single_word": false,
290
  "special": true
291
  },
292
- "49156": {
293
- "content": "INTJ",
294
  "lstrip": false,
295
  "normalized": false,
296
  "rstrip": false,
297
  "single_word": false,
298
  "special": true
299
  },
300
- "49157": {
301
- "content": "NOUN",
302
  "lstrip": false,
303
  "normalized": false,
304
  "rstrip": false,
305
  "single_word": false,
306
  "special": true
307
  },
308
- "49158": {
309
- "content": "PART",
310
  "lstrip": false,
311
  "normalized": false,
312
  "rstrip": false,
313
  "single_word": false,
314
  "special": true
315
  },
316
- "49159": {
317
- "content": "PRON",
318
  "lstrip": false,
319
  "normalized": false,
320
  "rstrip": false,
321
  "single_word": false,
322
  "special": true
323
  },
324
- "49160": {
325
- "content": "PROPN",
326
  "lstrip": false,
327
  "normalized": false,
328
  "rstrip": false,
329
  "single_word": false,
330
  "special": true
331
  },
332
- "49161": {
333
- "content": "PUNCT",
334
  "lstrip": false,
335
  "normalized": false,
336
  "rstrip": false,
337
  "single_word": false,
338
  "special": true
339
  },
340
- "49162": {
341
- "content": "SCONJ",
342
  "lstrip": false,
343
  "normalized": false,
344
  "rstrip": false,
345
  "single_word": false,
346
  "special": true
347
  },
348
- "49163": {
349
- "content": "SYM",
350
  "lstrip": false,
351
  "normalized": false,
352
  "rstrip": false,
353
  "single_word": false,
354
  "special": true
355
  },
356
- "49164": {
357
- "content": "VERB",
358
  "lstrip": false,
359
  "normalized": false,
360
  "rstrip": false,
361
  "single_word": false,
362
  "special": true
363
  },
364
- "49165": {
365
  "content": "JJ",
366
  "lstrip": false,
367
  "normalized": false,
@@ -369,79 +369,63 @@
369
  "single_word": false,
370
  "special": true
371
  },
372
- "49166": {
373
- "content": "JJR",
374
- "lstrip": false,
375
- "normalized": false,
376
- "rstrip": false,
377
- "single_word": false,
378
- "special": true
379
- },
380
- "49167": {
381
- "content": "JJS",
382
- "lstrip": false,
383
- "normalized": false,
384
- "rstrip": false,
385
- "single_word": false,
386
- "special": true
387
- },
388
- "49168": {
389
- "content": "NNS",
390
  "lstrip": false,
391
  "normalized": false,
392
  "rstrip": false,
393
  "single_word": false,
394
  "special": true
395
  },
396
- "49169": {
397
- "content": "NNP",
398
  "lstrip": false,
399
  "normalized": false,
400
  "rstrip": false,
401
  "single_word": false,
402
  "special": true
403
  },
404
- "49170": {
405
- "content": "NNPS",
406
  "lstrip": false,
407
  "normalized": false,
408
  "rstrip": false,
409
  "single_word": false,
410
  "special": true
411
  },
412
- "49171": {
413
- "content": "PDT",
414
  "lstrip": false,
415
  "normalized": false,
416
  "rstrip": false,
417
  "single_word": false,
418
  "special": true
419
  },
420
- "49172": {
421
- "content": "PRP",
422
  "lstrip": false,
423
  "normalized": false,
424
  "rstrip": false,
425
  "single_word": false,
426
  "special": true
427
  },
428
- "49173": {
429
- "content": "PRP$",
430
  "lstrip": false,
431
  "normalized": false,
432
  "rstrip": false,
433
  "single_word": false,
434
  "special": true
435
  },
436
- "49174": {
437
- "content": "RBR",
438
  "lstrip": false,
439
  "normalized": false,
440
  "rstrip": false,
441
  "single_word": false,
442
  "special": true
443
  },
444
- "49175": {
445
  "content": "RBS",
446
  "lstrip": false,
447
  "normalized": false,
@@ -449,124 +433,117 @@
449
  "single_word": false,
450
  "special": true
451
  },
452
- "49176": {
453
- "content": "UH",
454
  "lstrip": false,
455
  "normalized": false,
456
  "rstrip": false,
457
  "single_word": false,
458
  "special": true
459
  },
460
- "49177": {
461
- "content": "VB",
462
  "lstrip": false,
463
  "normalized": false,
464
  "rstrip": false,
465
  "single_word": false,
466
  "special": true
467
  },
468
- "49178": {
469
- "content": "VBD",
470
  "lstrip": false,
471
  "normalized": false,
472
  "rstrip": false,
473
  "single_word": false,
474
  "special": true
475
  },
476
- "49179": {
477
- "content": "VBG",
478
  "lstrip": false,
479
  "normalized": false,
480
  "rstrip": false,
481
  "single_word": false,
482
  "special": true
483
  },
484
- "49180": {
485
- "content": "VBN",
486
  "lstrip": false,
487
  "normalized": false,
488
  "rstrip": false,
489
  "single_word": false,
490
  "special": true
491
  },
492
- "49181": {
493
- "content": "VBP",
494
  "lstrip": false,
495
  "normalized": false,
496
  "rstrip": false,
497
  "single_word": false,
498
  "special": true
499
  },
500
- "49182": {
501
- "content": "VBZ",
502
  "lstrip": false,
503
  "normalized": false,
504
  "rstrip": false,
505
  "single_word": false,
506
  "special": true
507
  },
508
- "49183": {
509
  "content": "WDT",
510
  "lstrip": false,
511
  "normalized": false,
512
  "rstrip": false,
513
  "single_word": false,
514
  "special": true
515
- },
516
- "49184": {
517
- "content": "WP$",
518
- "lstrip": false,
519
- "normalized": false,
520
- "rstrip": false,
521
- "single_word": false,
522
- "special": true
523
- },
524
- "49185": {
525
- "content": "WRB",
526
- "lstrip": false,
527
- "normalized": false,
528
- "rstrip": false,
529
- "single_word": false,
530
- "special": true
531
  }
532
  },
533
  "additional_special_tokens": [
534
- "CC",
535
- "CD",
536
- "DT",
537
- "EX",
538
  "FW",
539
- "IN",
 
 
 
 
540
  "JJ",
541
- "JJR",
542
- "JJS",
543
- "LS",
544
- "MD",
 
545
  "NN",
 
 
 
 
 
 
 
 
 
 
546
  "NNS",
547
  "NNP",
548
- "NNPS",
 
 
549
  "PDT",
550
- "POS",
551
- "PRP",
552
- "PRP$",
553
- "RB",
554
- "RBR",
555
  "RBS",
556
- "RP",
 
 
 
 
 
 
 
 
 
557
  "SYM",
558
- "TO",
559
  "UH",
560
- "VB",
561
- "VBD",
562
- "VBG",
563
- "VBN",
564
- "VBP",
565
- "VBZ",
566
- "WDT",
567
- "WP",
568
- "WP$",
569
- "WRB"
570
  ],
571
  "bos_token": "<|endoftext|>",
572
  "clean_up_tokenization_spaces": false,
 
137
  "single_word": false,
138
  "special": true
139
  },
140
+ "20": {
141
+ "content": "$",
142
  "lstrip": false,
143
  "normalized": false,
144
  "rstrip": false,
145
  "single_word": false,
146
  "special": true
147
  },
148
+ "24": {
149
+ "content": "(",
150
  "lstrip": false,
151
  "normalized": false,
152
  "rstrip": false,
153
  "single_word": false,
154
  "special": true
155
  },
156
+ "25": {
157
+ "content": ")",
158
  "lstrip": false,
159
  "normalized": false,
160
  "rstrip": false,
161
  "single_word": false,
162
  "special": true
163
  },
164
+ "28": {
165
+ "content": ",",
166
  "lstrip": false,
167
  "normalized": false,
168
  "rstrip": false,
169
  "single_word": false,
170
  "special": true
171
  },
172
+ "30": {
173
+ "content": ".",
174
  "lstrip": false,
175
  "normalized": false,
176
  "rstrip": false,
177
  "single_word": false,
178
  "special": true
179
  },
180
+ "42": {
181
+ "content": ":",
182
  "lstrip": false,
183
  "normalized": false,
184
  "rstrip": false,
185
  "single_word": false,
186
  "special": true
187
  },
188
+ "423": {
189
+ "content": "--",
190
  "lstrip": false,
191
  "normalized": false,
192
  "rstrip": false,
193
  "single_word": false,
194
  "special": true
195
  },
196
+ "1969": {
197
+ "content": "``",
198
  "lstrip": false,
199
  "normalized": false,
200
  "rstrip": false,
201
  "single_word": false,
202
  "special": true
203
  },
204
+ "2113": {
205
+ "content": "IN",
206
  "lstrip": false,
207
  "normalized": false,
208
  "rstrip": false,
209
  "single_word": false,
210
  "special": true
211
  },
212
+ "3816": {
213
+ "content": "''",
214
  "lstrip": false,
215
  "normalized": false,
216
  "rstrip": false,
217
  "single_word": false,
218
  "special": true
219
  },
220
+ "5137": {
221
+ "content": "CC",
222
  "lstrip": false,
223
  "normalized": false,
224
  "rstrip": false,
225
  "single_word": false,
226
  "special": true
227
  },
228
+ "6998": {
229
+ "content": "CD",
230
  "lstrip": false,
231
  "normalized": false,
232
  "rstrip": false,
233
  "single_word": false,
234
  "special": true
235
  },
236
+ "9324": {
237
+ "content": "EX",
238
  "lstrip": false,
239
  "normalized": false,
240
  "rstrip": false,
241
  "single_word": false,
242
  "special": true
243
  },
244
+ "9815": {
245
+ "content": "MD",
246
  "lstrip": false,
247
  "normalized": false,
248
  "rstrip": false,
249
  "single_word": false,
250
  "special": true
251
  },
252
+ "10179": {
253
+ "content": "TO",
254
  "lstrip": false,
255
  "normalized": false,
256
  "rstrip": false,
257
  "single_word": false,
258
  "special": true
259
  },
260
+ "12041": {
261
+ "content": "LS",
262
  "lstrip": false,
263
  "normalized": false,
264
  "rstrip": false,
265
  "single_word": false,
266
  "special": true
267
  },
268
+ "17679": {
269
+ "content": "DT",
270
  "lstrip": false,
271
  "normalized": false,
272
  "rstrip": false,
273
  "single_word": false,
274
  "special": true
275
  },
276
+ "19668": {
277
+ "content": "NN",
278
  "lstrip": false,
279
  "normalized": false,
280
  "rstrip": false,
281
  "single_word": false,
282
  "special": true
283
  },
284
+ "22259": {
285
+ "content": "POS",
286
  "lstrip": false,
287
  "normalized": false,
288
  "rstrip": false,
289
  "single_word": false,
290
  "special": true
291
  },
292
+ "29482": {
293
+ "content": "RP",
294
  "lstrip": false,
295
  "normalized": false,
296
  "rstrip": false,
297
  "single_word": false,
298
  "special": true
299
  },
300
+ "35991": {
301
+ "content": "FW",
302
  "lstrip": false,
303
  "normalized": false,
304
  "rstrip": false,
305
  "single_word": false,
306
  "special": true
307
  },
308
+ "45780": {
309
+ "content": "RB",
310
  "lstrip": false,
311
  "normalized": false,
312
  "rstrip": false,
313
  "single_word": false,
314
  "special": true
315
  },
316
+ "47306": {
317
+ "content": "WP",
318
  "lstrip": false,
319
  "normalized": false,
320
  "rstrip": false,
321
  "single_word": false,
322
  "special": true
323
  },
324
+ "49152": {
325
+ "content": "PRP$",
326
  "lstrip": false,
327
  "normalized": false,
328
  "rstrip": false,
329
  "single_word": false,
330
  "special": true
331
  },
332
+ "49153": {
333
+ "content": "VBG",
334
  "lstrip": false,
335
  "normalized": false,
336
  "rstrip": false,
337
  "single_word": false,
338
  "special": true
339
  },
340
+ "49154": {
341
+ "content": "VB",
342
  "lstrip": false,
343
  "normalized": false,
344
  "rstrip": false,
345
  "single_word": false,
346
  "special": true
347
  },
348
+ "49155": {
349
+ "content": "VBP",
350
  "lstrip": false,
351
  "normalized": false,
352
  "rstrip": false,
353
  "single_word": false,
354
  "special": true
355
  },
356
+ "49156": {
357
+ "content": "VBN",
358
  "lstrip": false,
359
  "normalized": false,
360
  "rstrip": false,
361
  "single_word": false,
362
  "special": true
363
  },
364
+ "49157": {
365
  "content": "JJ",
366
  "lstrip": false,
367
  "normalized": false,
 
369
  "single_word": false,
370
  "special": true
371
  },
372
+ "49158": {
373
+ "content": "VBZ",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  "lstrip": false,
375
  "normalized": false,
376
  "rstrip": false,
377
  "single_word": false,
378
  "special": true
379
  },
380
+ "49159": {
381
+ "content": "RBR",
382
  "lstrip": false,
383
  "normalized": false,
384
  "rstrip": false,
385
  "single_word": false,
386
  "special": true
387
  },
388
+ "49160": {
389
+ "content": "VBD",
390
  "lstrip": false,
391
  "normalized": false,
392
  "rstrip": false,
393
  "single_word": false,
394
  "special": true
395
  },
396
+ "49161": {
397
+ "content": "NNS",
398
  "lstrip": false,
399
  "normalized": false,
400
  "rstrip": false,
401
  "single_word": false,
402
  "special": true
403
  },
404
+ "49162": {
405
+ "content": "NNP",
406
  "lstrip": false,
407
  "normalized": false,
408
  "rstrip": false,
409
  "single_word": false,
410
  "special": true
411
  },
412
+ "49163": {
413
+ "content": "WRB",
414
  "lstrip": false,
415
  "normalized": false,
416
  "rstrip": false,
417
  "single_word": false,
418
  "special": true
419
  },
420
+ "49164": {
421
+ "content": "PDT",
422
  "lstrip": false,
423
  "normalized": false,
424
  "rstrip": false,
425
  "single_word": false,
426
  "special": true
427
  },
428
+ "49165": {
429
  "content": "RBS",
430
  "lstrip": false,
431
  "normalized": false,
 
433
  "single_word": false,
434
  "special": true
435
  },
436
+ "49166": {
437
+ "content": "PRP",
438
  "lstrip": false,
439
  "normalized": false,
440
  "rstrip": false,
441
  "single_word": false,
442
  "special": true
443
  },
444
+ "49167": {
445
+ "content": "WP$",
446
  "lstrip": false,
447
  "normalized": false,
448
  "rstrip": false,
449
  "single_word": false,
450
  "special": true
451
  },
452
+ "49168": {
453
+ "content": "NNPS",
454
  "lstrip": false,
455
  "normalized": false,
456
  "rstrip": false,
457
  "single_word": false,
458
  "special": true
459
  },
460
+ "49169": {
461
+ "content": "JJS",
462
  "lstrip": false,
463
  "normalized": false,
464
  "rstrip": false,
465
  "single_word": false,
466
  "special": true
467
  },
468
+ "49170": {
469
+ "content": "JJR",
470
  "lstrip": false,
471
  "normalized": false,
472
  "rstrip": false,
473
  "single_word": false,
474
  "special": true
475
  },
476
+ "49171": {
477
+ "content": "SYM",
478
  "lstrip": false,
479
  "normalized": false,
480
  "rstrip": false,
481
  "single_word": false,
482
  "special": true
483
  },
484
+ "49172": {
485
+ "content": "UH",
486
  "lstrip": false,
487
  "normalized": false,
488
  "rstrip": false,
489
  "single_word": false,
490
  "special": true
491
  },
492
+ "49173": {
493
  "content": "WDT",
494
  "lstrip": false,
495
  "normalized": false,
496
  "rstrip": false,
497
  "single_word": false,
498
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
  }
500
  },
501
  "additional_special_tokens": [
502
+ "PRP$",
503
+ "VBG",
 
 
504
  "FW",
505
+ "VB",
506
+ "POS",
507
+ "''",
508
+ "VBP",
509
+ "VBN",
510
  "JJ",
511
+ "WP",
512
+ "VBZ",
513
+ "DT",
514
+ "RP",
515
+ "$",
516
  "NN",
517
+ ")",
518
+ "(",
519
+ "RBR",
520
+ "VBD",
521
+ ",",
522
+ ".",
523
+ "TO",
524
+ "LS",
525
+ "RB",
526
+ ":",
527
  "NNS",
528
  "NNP",
529
+ "``",
530
+ "WRB",
531
+ "CC",
532
  "PDT",
 
 
 
 
 
533
  "RBS",
534
+ "PRP",
535
+ "CD",
536
+ "EX",
537
+ "IN",
538
+ "WP$",
539
+ "MD",
540
+ "NNPS",
541
+ "--",
542
+ "JJS",
543
+ "JJR",
544
  "SYM",
 
545
  "UH",
546
+ "WDT"
 
 
 
 
 
 
 
 
 
547
  ],
548
  "bos_token": "<|endoftext|>",
549
  "clean_up_tokenization_spaces": false,