japanese-splade-base-v1 / tokenizer.json

Update tokenizer.json

4939a3f verified 4 months ago

8.42 kB

	{
	"version": "1.0",
	"truncation": null,
	"padding": null,
	"added_tokens": [
	{
	"id": 0,
	"content": "[PAD]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 1,
	"content": "[UNK]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 2,
	"content": "[CLS]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 3,
	"content": "[SEP]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 4,
	"content": "[MASK]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	}
	],
	"normalizer": null,
	"pre_tokenizer": {
	"type": "Whitespace"
	},
	"post_processor": {
	"type": "TemplateProcessing",
	"single": [
	{
	"SpecialToken": {
	"id": "[CLS]",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"SpecialToken": {
	"id": "[SEP]",
	"type_id": 0
	}
	}
	],
	"pair": [
	{
	"SpecialToken": {
	"id": "[CLS]",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"SpecialToken": {
	"id": "[SEP]",
	"type_id": 0
	}
	},
	{
	"SpecialToken": {
	"id": "[CLS]",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "B",
	"type_id": 0
	}
	},
	{
	"SpecialToken": {
	"id": "[SEP]",
	"type_id": 0
	}
	}
	],
	"special_tokens": {
	"[CLS]": {
	"id": "[CLS]",
	"ids": [
	2
	],
	"tokens": [
	"[CLS]"
	]
	},
	"[SEP]": {
	"id": "[SEP]",
	"ids": [
	3
	],
	"tokens": [
	"[SEP]"
	]
	}
	}
	},
	"decoder": {
	"type": "BPEDecoder",
	"suffix": "</w>"
	},
	"model": {
	"type": "BPE",
	"dropout": null,
	"unk_token": null,
	"continuing_subword_prefix": null,
	"end_of_word_suffix": null,
	"fuse_unk": false,
	"byte_fallback": false,
	"ignore_merges": false,
	"vocab": {
	"[PAD]": 0,
	"[UNK]": 1,
	"[CLS]": 2,
	"[SEP]": 3,
	"[MASK]": 4,
	"!": 5,
	"\"": 6,
	"%": 7,
	"&": 8,
	"'": 9,
	"(": 10,
	")": 11,
	"*": 12,
	"+": 13,
	",": 14,
	"-": 15,
	".": 16,
	"/": 17,
	"0": 18,
	"1": 19,
	"2": 20,
	"3": 21,
	"4": 22,
	"5": 23,
	"6": 24,
	"7": 25,
	"8": 26,
	"9": 27,
	":": 28,
	";": 29,
	"?": 30,
	"A": 31,
	"B": 32,
	"C": 33,
	"D": 34,
	"E": 35,
	"F": 36,
	"G": 37,
	"H": 38,
	"I": 39,
	"J": 40,
	"K": 41,
	"L": 42,
	"M": 43,
	"N": 44,
	"O": 45,
	"P": 46,
	"Q": 47,
	"R": 48,
	"S": 49,
	"T": 50,
	"U": 51,
	"V": 52,
	"W": 53,
	"X": 54,
	"Y": 55,
	"Z": 56,
	"[": 57,
	"]": 58,
	"_": 59,
	"a": 60,
	"b": 61,
	"c": 62,
	"d": 63,
	"e": 64,
	"f": 65,
	"g": 66,
	"h": 67,
	"i": 68,
	"j": 69,
	"k": 70,
	"l": 71,
	"m": 72,
	"n": 73,
	"o": 74,
	"p": 75,
	"q": 76,
	"r": 77,
	"s": 78,
	"t": 79,
	"u": 80,
	"v": 81,
	"w": 82,
	"x": 83,
	"y": 84,
	"z": 85,
	"\|": 86,
	"§": 87,
	"Á": 88,
	"Æ": 89,
	"á": 90,
	"æ": 91,
	"ç": 92,
	"è": 93,
	"é": 94,
	"í": 95,
	"ð": 96,
	"ö": 97,
	"ú": 98,
	"ü": 99,
	"þ": 100,
	"ā": 101,
	"ē": 102,
	"ŋ": 103,
	"ƿ": 104,
	"ɑ": 105,
	"ɒ": 106,
	"ɔ": 107,
	"ɖ": 108,
	"ə": 109,
	"ɚ": 110,
	"ɛ": 111,
	"ɜ": 112,
	"ɡ": 113,
	"ɪ": 114,
	"ɫ": 115,
	"ɹ": 116,
	"ɾ": 117,
	"ʃ": 118,
	"ʈ": 119,
	"ʊ": 120,
	"ʌ": 121,
	"ʍ": 122,
	"ʒ": 123,
	"ʔ": 124,
	"ʰ": 125,
	"ʱ": 126,
	"ʲ": 127,
	"ʷ": 128,
	"ˈ": 129,
	"ː": 130,
	"ˑ": 131,
	"̚": 132,
	"̥": 133,
	"̩": 134,
	"̪": 135,
	"̯": 136,
	"͡": 137,
	"θ": 138,
	"‑": 139,
	"–": 140,
	"—": 141,
	"∅": 142,
	"⟨": 143,
	"⟩": 144,
	"an": 145,
	"th": 146,
	"in": 147,
	"on": 148,
	"er": 149,
	"is": 150,
	"es": 151,
	"or": 152,
	"the": 153,
	"ti": 154,
	"ar": 155,
	"al": 156,
	"en": 157,
	"ed": 158,
	"of": 159,
	"and": 160,
	"gl": 161,
	"ish": 162,
	"ngl": 163,
	"Engl": 164,
	"English": 165,
	"as": 166,
	"ic": 167,
	"ou": 168,
	"20": 169,
	"tion": 170,
	"ing": 171,
	"ec": 172,
	"om": 173,
	"at": 174,
	"st": 175,
	"it": 176,
	"le": 177,
	"ge": 178,
	"re": 179,
	"gu": 180,
	"angu": 181,
	"angua": 182,
	"ch": 183,
	"ent": 184,
	"ve": 185,
	"to": 186,
	").": 187,
	"ation": 188,
	"ri": 189,
	"ly": 190,
	"am": 191,
	"oun": 192,
	"ers": 193,
	"anguage": 194,
	"for": 195,
	"fr": 196,
	"ll": 197,
	"us": 198,
	"200": 199,
	"he": 200,
	"tic": 201,
	"pr": 202,
	"di": 203,
	"ow": 204,
	"et": 205,
	"ig": 206,
	"19": 207,
	"pe": 208,
	"ac": 209,
	".[": 210,
	"ur": 211,
	"wi": 212,
	"201": 213,
	"ect": 214,
	"iv": 215,
	"ess": 216,
	"The": 217,
	"ol": 218,
	"ter": 219,
	"de": 220,
	"language": 221,
	"wor": 222,
	"from": 223,
	"un": 224,
	"In": 225,
	"ver": 226,
	"ir": 227,
	"are": 228,
	"cl": 229,
	"ther": 230,
	"ad": 231,
	"man": 232,
	"con": 233,
	"ab": 234,
	"ex": 235,
	"with": 236,
	"pp": 237,
	"wh": 238,
	"el": 239,
	"97": 240,
	"ary": 241,
	"10": 242,
	"su": 243,
	"ph": 244,
	"ul": 245,
	"po": 246,
	"978": 247,
	"ld": 248,
	"ak": 249,
	"si": 250,
	"ru": 251,
	"tive": 252,
	"ds": 253,
	"oc": 254,
	"enc": 255
	},
	"merges": [
	"a n",
	"t h",
	"i n",
	"o n",
	"e r",
	"i s",
	"e s",
	"o r",
	"th e",
	"t i",
	"a r",
	"a l",
	"e n",
	"e d",
	"o f",
	"an d",
	"g l",
	"is h",
	"n gl",
	"E ngl",
	"Engl ish",
	"a s",
	"i c",
	"o u",
	"2 0",
	"ti on",
	"in g",
	"e c",
	"o m",
	"a t",
	"s t",
	"i t",
	"l e",
	"g e",
	"r e",
	"g u",
	"an gu",
	"angu a",
	"c h",
	"en t",
	"v e",
	"t o",
	") .",
	"a tion",
	"r i",
	"l y",
	"a m",
	"ou n",
	"er s",
	"angua ge",
	"f or",
	"f r",
	"l l",
	"u s",
	"20 0",
	"h e",
	"ti c",
	"p r",
	"d i",
	"o w",
	"e t",
	"i g",
	"1 9",
	"p e",
	"a c",
	". [",
	"u r",
	"w i",
	"20 1",
	"ec t",
	"i v",
	"es s",
	"T he",
	"o l",
	"t er",
	"d e",
	"l anguage",
	"w or",
	"fr om",
	"u n",
	"I n",
	"v er",
	"i r",
	"ar e",
	"c l",
	"th er",
	"a d",
	"m an",
	"c on",
	"a b",
	"e x",
	"wi th",
	"p p",
	"w h",
	"e l",
	"9 7",
	"ar y",
	"1 0",
	"s u",
	"p h",
	"u l",
	"p o",
	"97 8",
	"l d",
	"a k",
	"s i",
	"r u",
	"ti ve",
	"d s",
	"o c",
	"en c"
	]
	}
	}