Safetensors
Japanese
bert
japanese-splade-base-v1 / tokenizer.json
hotchpotch's picture
Update tokenizer.json
4939a3f verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "BPEDecoder",
"suffix": "</w>"
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"!": 5,
"\"": 6,
"%": 7,
"&": 8,
"'": 9,
"(": 10,
")": 11,
"*": 12,
"+": 13,
",": 14,
"-": 15,
".": 16,
"/": 17,
"0": 18,
"1": 19,
"2": 20,
"3": 21,
"4": 22,
"5": 23,
"6": 24,
"7": 25,
"8": 26,
"9": 27,
":": 28,
";": 29,
"?": 30,
"A": 31,
"B": 32,
"C": 33,
"D": 34,
"E": 35,
"F": 36,
"G": 37,
"H": 38,
"I": 39,
"J": 40,
"K": 41,
"L": 42,
"M": 43,
"N": 44,
"O": 45,
"P": 46,
"Q": 47,
"R": 48,
"S": 49,
"T": 50,
"U": 51,
"V": 52,
"W": 53,
"X": 54,
"Y": 55,
"Z": 56,
"[": 57,
"]": 58,
"_": 59,
"a": 60,
"b": 61,
"c": 62,
"d": 63,
"e": 64,
"f": 65,
"g": 66,
"h": 67,
"i": 68,
"j": 69,
"k": 70,
"l": 71,
"m": 72,
"n": 73,
"o": 74,
"p": 75,
"q": 76,
"r": 77,
"s": 78,
"t": 79,
"u": 80,
"v": 81,
"w": 82,
"x": 83,
"y": 84,
"z": 85,
"|": 86,
"§": 87,
"Á": 88,
"Æ": 89,
"á": 90,
"æ": 91,
"ç": 92,
"è": 93,
"é": 94,
"í": 95,
"ð": 96,
"ö": 97,
"ú": 98,
"ü": 99,
"þ": 100,
"ā": 101,
"ē": 102,
"ŋ": 103,
"ƿ": 104,
"ɑ": 105,
"ɒ": 106,
"ɔ": 107,
"ɖ": 108,
"ə": 109,
"ɚ": 110,
"ɛ": 111,
"ɜ": 112,
"ɡ": 113,
"ɪ": 114,
"ɫ": 115,
"ɹ": 116,
"ɾ": 117,
"ʃ": 118,
"ʈ": 119,
"ʊ": 120,
"ʌ": 121,
"ʍ": 122,
"ʒ": 123,
"ʔ": 124,
"ʰ": 125,
"ʱ": 126,
"ʲ": 127,
"ʷ": 128,
"ˈ": 129,
"ː": 130,
"ˑ": 131,
"̚": 132,
"̥": 133,
"̩": 134,
"̪": 135,
"̯": 136,
"͡": 137,
"θ": 138,
"‑": 139,
"–": 140,
"—": 141,
"∅": 142,
"⟨": 143,
"⟩": 144,
"an": 145,
"th": 146,
"in": 147,
"on": 148,
"er": 149,
"is": 150,
"es": 151,
"or": 152,
"the": 153,
"ti": 154,
"ar": 155,
"al": 156,
"en": 157,
"ed": 158,
"of": 159,
"and": 160,
"gl": 161,
"ish": 162,
"ngl": 163,
"Engl": 164,
"English": 165,
"as": 166,
"ic": 167,
"ou": 168,
"20": 169,
"tion": 170,
"ing": 171,
"ec": 172,
"om": 173,
"at": 174,
"st": 175,
"it": 176,
"le": 177,
"ge": 178,
"re": 179,
"gu": 180,
"angu": 181,
"angua": 182,
"ch": 183,
"ent": 184,
"ve": 185,
"to": 186,
").": 187,
"ation": 188,
"ri": 189,
"ly": 190,
"am": 191,
"oun": 192,
"ers": 193,
"anguage": 194,
"for": 195,
"fr": 196,
"ll": 197,
"us": 198,
"200": 199,
"he": 200,
"tic": 201,
"pr": 202,
"di": 203,
"ow": 204,
"et": 205,
"ig": 206,
"19": 207,
"pe": 208,
"ac": 209,
".[": 210,
"ur": 211,
"wi": 212,
"201": 213,
"ect": 214,
"iv": 215,
"ess": 216,
"The": 217,
"ol": 218,
"ter": 219,
"de": 220,
"language": 221,
"wor": 222,
"from": 223,
"un": 224,
"In": 225,
"ver": 226,
"ir": 227,
"are": 228,
"cl": 229,
"ther": 230,
"ad": 231,
"man": 232,
"con": 233,
"ab": 234,
"ex": 235,
"with": 236,
"pp": 237,
"wh": 238,
"el": 239,
"97": 240,
"ary": 241,
"10": 242,
"su": 243,
"ph": 244,
"ul": 245,
"po": 246,
"978": 247,
"ld": 248,
"ak": 249,
"si": 250,
"ru": 251,
"tive": 252,
"ds": 253,
"oc": 254,
"enc": 255
},
"merges": [
"a n",
"t h",
"i n",
"o n",
"e r",
"i s",
"e s",
"o r",
"th e",
"t i",
"a r",
"a l",
"e n",
"e d",
"o f",
"an d",
"g l",
"is h",
"n gl",
"E ngl",
"Engl ish",
"a s",
"i c",
"o u",
"2 0",
"ti on",
"in g",
"e c",
"o m",
"a t",
"s t",
"i t",
"l e",
"g e",
"r e",
"g u",
"an gu",
"angu a",
"c h",
"en t",
"v e",
"t o",
") .",
"a tion",
"r i",
"l y",
"a m",
"ou n",
"er s",
"angua ge",
"f or",
"f r",
"l l",
"u s",
"20 0",
"h e",
"ti c",
"p r",
"d i",
"o w",
"e t",
"i g",
"1 9",
"p e",
"a c",
". [",
"u r",
"w i",
"20 1",
"ec t",
"i v",
"es s",
"T he",
"o l",
"t er",
"d e",
"l anguage",
"w or",
"fr om",
"u n",
"I n",
"v er",
"i r",
"ar e",
"c l",
"th er",
"a d",
"m an",
"c on",
"a b",
"e x",
"wi th",
"p p",
"w h",
"e l",
"9 7",
"ar y",
"1 0",
"s u",
"p h",
"u l",
"p o",
"97 8",
"l d",
"a k",
"s i",
"r u",
"ti ve",
"d s",
"o c",
"en c"
]
}
}