cd_tokenizer / tokenizer.json
tuanio's picture
Upload tokenizer
f5391bf
raw
history blame
20.9 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": true
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[UNK]": 0,
"[PAD]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"'": 5,
"(": 6,
")": 7,
"-": 8,
".": 9,
"8": 10,
"<": 11,
">": 12,
"@": 13,
"`": 14,
"a": 15,
"b": 16,
"c": 17,
"d": 18,
"e": 19,
"f": 20,
"g": 21,
"h": 22,
"i": 23,
"j": 24,
"k": 25,
"l": 26,
"m": 27,
"n": 28,
"o": 29,
"p": 30,
"q": 31,
"r": 32,
"s": 33,
"t": 34,
"u": 35,
"v": 36,
"w": 37,
"x": 38,
"y": 39,
"z": 40,
"##o": 41,
"##r": 42,
"##t": 43,
"##h": 44,
"##e": 45,
"##l": 46,
"##y": 47,
"##u": 48,
"##f": 49,
"##a": 50,
"##c": 51,
"##i": 52,
"##n": 53,
"##g": 54,
"##s": 55,
"##k": 56,
"##d": 57,
"##b": 58,
"##m": 59,
"##p": 60,
"##w": 61,
"##v": 62,
"##j": 63,
"##x": 64,
"##q": 65,
"##z": 66,
"##8": 67,
"##er": 68,
"##re": 69,
"##ve": 70,
"##in": 71,
"on": 72,
"one": 73,
"th": 74,
"##ree": 75,
"##wo": 76,
"two": 77,
"##ero": 78,
"zero": 79,
"three": 80,
"##ar": 81,
"##ig": 82,
"##igh": 83,
"##ou": 84,
"##ight": 85,
"##nd": 86,
"##eve": 87,
"fi": 88,
"si": 89,
"eight": 90,
"five": 91,
"nin": 92,
"##ay": 93,
"six": 94,
"##even": 95,
"seven": 96,
"##our": 97,
"de": 98,
"##ing": 99,
"four": 100,
"niner": 101,
"##al": 102,
"##le": 103,
"##mb": 104,
"##or": 105,
"##ky": 106,
"sky": 107,
"##ark": 108,
"skyl": 109,
"skylark": 110,
"##ac": 111,
"##nw": 112,
"to": 113,
"##on": 114,
"leve": 115,
"level": 116,
"##ur": 117,
"re": 118,
"##ad": 119,
"er": 120,
"go": 121,
"##and": 122,
"ru": 123,
"##nway": 124,
"runway": 125,
"##li": 126,
"##red": 127,
"##ap": 128,
"##ro": 129,
"##sc": 130,
"##an": 131,
"##ir": 132,
"##ind": 133,
"##mber": 134,
"##ue": 135,
"fl": 136,
"cle": 137,
"##et": 138,
"##tar": 139,
"con": 140,
"flight": 141,
"and": 142,
"##ared": 143,
"##ct": 144,
"cleared": 145,
"se": 146,
"##ke": 147,
"for": 148,
"##is": 149,
"##ch": 150,
"##ot": 151,
"cli": 152,
"climb": 153,
"no": 154,
"##end": 155,
"##im": 156,
"thou": 157,
"thous": 158,
"thousand": 159,
"##um": 160,
"##ed": 161,
"##scend": 162,
"descend": 163,
"##ain": 164,
"sele": 165,
"##at": 166,
"seletar": 167,
"##ump": 168,
"lump": 169,
"lumpur": 170,
"cont": 171,
"##od": 172,
"good": 173,
"##ra": 174,
"dec": 175,
"##imal": 176,
"decimal": 177,
"##pp": 178,
"app": 179,
"appro": 180,
"nove": 181,
"november": 182,
"##st": 183,
"##ow": 184,
"fe": 185,
"feet": 186,
"vi": 187,
"mal": 188,
"cap": 189,
"rad": 190,
"tow": 191,
"##act": 192,
"tower": 193,
"##ead": 194,
"radar": 195,
"##si": 196,
"nine": 197,
"nu": 198,
"number": 199,
"contact": 200,
"red": 201,
"##ff": 202,
"##be": 203,
"##ace": 204,
"sur": 205,
"surf": 206,
"surface": 207,
"wind": 208,
"##not": 209,
"##ly": 210,
"knot": 211,
"knots": 212,
"##gree": 213,
"##xt": 214,
"##op": 215,
"##scar": 216,
"oscar": 217,
"##it": 218,
"##ft": 219,
"degree": 220,
"degrees": 221,
"fo": 222,
"ech": 223,
"echo": 224,
"##ied": 225,
"que": 226,
"##bec": 227,
"quebec": 228,
"dow": 229,
"##rot": 230,
"##rect": 231,
"foxt": 232,
"foxtrot": 233,
"land": 234,
"##ctor": 235,
"le": 236,
"##nwind": 237,
"downwind": 238,
"victor": 239,
"##il": 240,
"##ore": 241,
"##en": 242,
"##que": 243,
"head": 244,
"heading": 245,
"sing": 246,
"##vo": 247,
"singap": 248,
"singapore": 249,
"di": 250,
"bra": 251,
"bravo": 252,
"##fly": 253,
"##ss": 254,
"reque": 255,
"request": 256,
"direct": 257,
"##ning": 258,
"wh": 259,
"##irm": 260,
"##und": 261,
"hund": 262,
"hundred": 263,
"fire": 264,
"firefly": 265,
"##ach": 266,
"tou": 267,
"touch": 268,
"pap": 269,
"papa": 270,
"##ate": 271,
"malay": 272,
"malaysi": 273,
"main": 274,
"##tain": 275,
"maintain": 276,
"you": 277,
"st": 278,
"day": 279,
"approach": 280,
"fin": 281,
"final": 282,
"##ved": 283,
"approved": 284,
"air": 285,
"cop": 286,
"##ta": 287,
"malaysian": 288,
"sil": 289,
"ta": 290,
"##ti": 291,
"mi": 292,
"##ang": 293,
"we": 294,
"ch": 295,
"##ka": 296,
"##ound": 297,
"by": 298,
"silka": 299,
"silkair": 300,
"copied": 301,
"al": 302,
"the": 303,
"vis": 304,
"##oin": 305,
"left": 306,
"ro": 307,
"whis": 308,
"take": 309,
"##key": 310,
"whiskey": 311,
"kil": 312,
"kilo": 313,
"##round": 314,
"##off": 315,
"takeoff": 316,
"##ort": 317,
"##ual": 318,
"##ld": 319,
"pa": 320,
"in": 321,
"##co": 322,
"vac": 323,
"up": 324,
"visual": 325,
"mike": 326,
"eve": 327,
"evening": 328,
"at": 329,
"is": 330,
"##indo": 331,
"vacate": 332,
"del": 333,
"ho": 334,
"##ic": 335,
"tur": 336,
"pass": 337,
"##firm": 338,
"confirm": 339,
"##qu": 340,
"hold": 341,
"read": 342,
"delta": 343,
"##arli": 344,
"##arlie": 345,
"charlie": 346,
"##el": 347,
"malindo": 348,
"##ia": 349,
"##ack": 350,
"sco": 351,
"scoot": 352,
"scooter": 353,
"ind": 354,
"ready": 355,
"##aw": 356,
"##ger": 357,
"wa": 358,
"ray": 359,
"##enti": 360,
"passing": 361,
"tang": 362,
"tango": 363,
"ground": 364,
"##awk": 365,
"##quawk": 366,
"squawk": 367,
"india": 368,
"id": 369,
"jet": 370,
"##fter": 371,
"##ank": 372,
"##ine": 373,
"##fied": 374,
"identi": 375,
"identified": 376,
"roger": 377,
"after": 378,
"ju": 379,
"##ph": 380,
"alph": 381,
"alpha": 382,
"line": 383,
"##ffirm": 384,
"affirm": 385,
"##erra": 386,
"sierra": 387,
"##ion": 388,
"##port": 389,
"##ver": 390,
"report": 391,
"##am": 392,
"clear": 393,
"##gon": 394,
"wagon": 395,
"of": 396,
"contin": 397,
"over": 398,
"continue": 399,
"sir": 400,
"turn": 401,
"##lf": 402,
"ex": 403,
"golf": 404,
"##ett": 405,
"##ri": 406,
"bye": 407,
"juli": 408,
"thank": 409,
"##ith": 410,
"juliett": 411,
"with": 412,
"climbing": 413,
"##ul": 414,
"via": 415,
"tr": 416,
"right": 417,
"##cu": 418,
"are": 419,
"##head": 420,
"##av": 421,
"##her": 422,
"exp": 423,
"now": 424,
"cir": 425,
"##cuit": 426,
"circuit": 427,
"##art": 428,
"low": 429,
"##ima": 430,
"hot": 431,
"mor": 432,
"dep": 433,
"hotel": 434,
"morning": 435,
"depart": 436,
"stop": 437,
"##sia": 438,
"##oint": 439,
"point": 440,
"lima": 441,
"##de": 442,
"jets": 443,
"jetstar": 444,
"ad": 445,
"ar": 446,
"bo": 447,
"##ave": 448,
"##kee": 449,
"yan": 450,
"yankee": 451,
"asia": 452,
"when": 453,
"stand": 454,
"##no": 455,
"##val": 456,
"join": 457,
"arri": 458,
"arrival": 459,
"be": 460,
"wil": 461,
"adv": 462,
"overhead": 463,
"su": 464,
"fly": 465,
"##atar": 466,
"bob": 467,
"afterno": 468,
"afternoon": 469,
"advis": 470,
"##ffic": 471,
"tra": 472,
"traffic": 473,
"ma": 474,
"holding": 475,
"gli": 476,
"glide": 477,
"##aving": 478,
"high": 479,
"leaving": 480,
"##rol": 481,
"control": 482,
"bobis": 483,
"##ce": 484,
"track": 485,
"due": 486,
"ag": 487,
"your": 488,
"med": 489,
"again": 490,
"##ure": 491,
"medic": 492,
"have": 493,
"ne": 494,
"##me": 495,
"##orm": 496,
"qatar": 497,
"will": 498,
"qatari": 499,
"##ma": 500,
"##ather": 501,
"weather": 502,
"sel": 503,
"##amat": 504,
"selamat": 505,
"ful": 506,
"cal": 507,
"##les": 508,
"##ive": 509,
"this": 510,
"full": 511,
"##ak": 512,
"##one": 513,
"rome": 514,
"romeo": 515,
"##oh": 516,
"##ates": 517,
"sp": 518,
"##ress": 519,
"em": 520,
"joh": 521,
"emir": 522,
"johor": 523,
"cor": 524,
"emirates": 525,
"correct": 526,
"express": 527,
"##ative": 528,
"##ation": 529,
"##aya": 530,
"spe": 531,
"correction": 532,
"##bar": 533,
"speed": 534,
"departure": 535,
"paya": 536,
"lebar": 537,
"advised": 538,
"ara": 539,
"##ep": 540,
"arama": 541,
"zone": 542,
"say": 543,
"##ax": 544,
"##ble": 545,
"neg": 546,
"negative": 547,
"back": 548,
"miles": 549,
"sal": 550,
"##bek": 551,
"##tion": 552,
"nobek": 553,
"am": 554,
"within": 555,
"##ulu": 556,
"co": 557,
"salax": 558,
"ke": 559,
"zulu": 560,
"around": 561,
"keep": 562,
"##sition": 563,
"suk": 564,
"can": 565,
"##ect": 566,
"descending": 567,
"##ial": 568,
"##ross": 569,
"xan": 570,
"##adu": 571,
"xanadu": 572,
"cross": 573,
"##se": 574,
"he": 575,
"all": 576,
"nort": 577,
"##te": 578,
"##un": 579,
"call": 580,
"##tr": 581,
"##track": 582,
"backtrack": 583,
"un": 584,
"maintaining": 585,
"##ast": 586,
"approaching": 587,
"ok": 588,
"po": 589,
"area": 590,
"norm": 591,
"##way": 592,
"normal": 593,
"init": 594,
"initial": 595,
"##night": 596,
"##form": 597,
"advise": 598,
"expect": 599,
"ob": 600,
"goodnight": 601,
"op": 602,
"sh": 603,
"sw": 604,
"that": 605,
"##ter": 606,
"squawking": 607,
"##itch": 608,
"turns": 609,
"##go": 610,
"##ck": 611,
"##quen": 612,
"malam": 613,
"##ent": 614,
"##ase": 615,
"fro": 616,
"fan": 617,
"tran": 618,
"from": 619,
"##stop": 620,
"fanstop": 621,
"malaysia": 622,
"##up": 623,
"initially": 624,
"position": 625,
"top": 626,
"copy": 627,
"##ird": 628,
"##ankan": 629,
"gup": 630,
"uh": 631,
"away": 632,
"gupta": 633,
"departing": 634,
"ah": 635,
"ops": 636,
"com": 637,
"going": 638,
"turning": 639,
"switch": 640,
"wet": 641,
"##bird": 642,
"speedbird": 643,
"##ian": 644,
"##bound": 645,
"che": 646,
"##ers": 647,
"west": 648,
"##quenc": 649,
"check": 650,
"requesting": 651,
"##ril": 652,
"sril": 653,
"##ish": 654,
"srilankan": 655,
"jak": 656,
"##th": 657,
"##arta": 658,
"jakarta": 659,
"tax": 660,
"ear": 661,
"##ru": 662,
"early": 663,
"fre": 664,
"taxi": 665,
"late": 666,
"##cal": 667,
"##quency": 668,
"frequency": 669,
"north": 670,
"airf": 671,
"vista": 672,
"rec": 673,
"an": 674,
"car": 675,
"met": 676,
"sukri": 677,
"ahead": 678,
"##ut": 679,
"sub": 680,
"##gi": 681,
"est": 682,
"cargo": 683,
"or": 684,
"obst": 685,
"##ab": 686,
"##etang": 687,
"pro": 688,
"petang": 689,
"##ance": 690,
"ext": 691,
"subang": 692,
"##id": 693,
"##ong": 694,
"do": 695,
"hand": 696,
"topor": 697,
"obstac": 698,
"##au": 699,
"extend": 700,
"eti": 701,
"##had": 702,
"etihad": 703,
"##us": 704,
"##ako": 705,
"og": 706,
"ogako": 707,
"meters": 708,
"sukat": 709,
"east": 710,
"##bo": 711,
"height": 712,
"short": 713,
"wait": 714,
"##ere": 715,
"switching": 716,
"able": 717,
"landing": 718,
"##per": 719,
"higher": 720,
"##ruct": 721,
"##onian": 722,
"##edonian": 723,
"caledonian": 724,
"ifly": 725,
"##sh": 726,
"##ther": 727,
"make": 728,
"just": 729,
"wilco": 730,
"##men": 731,
"chang": 732,
"sight": 733,
"##ath": 734,
"recle": 735,
"turk": 736,
"airb": 737,
"turkish": 738,
"recleared": 739,
"super": 740,
"pagi": 741,
"commen": 742,
"##ite": 743,
"exped": 744,
"inform": 745,
"requ": 746,
"requi": 747,
"##ie": 748,
"expedite": 749,
"batar": 750,
"##fa": 751,
"alfa": 752,
"there": 753,
"change": 754,
"information": 755,
"rej": 756,
"rejoin": 757,
"##able": 758,
"##ime": 759,
"##mas": 760,
"ge": 761,
"##os": 762,
"gemas": 763,
"##pt": 764,
"##ne": 765,
"##green": 766,
"supergreen": 767,
"min": 768,
"struct": 769,
"##ceed": 770,
"structure": 771,
"##em": 772,
"ac": 773,
"##ans": 774,
"fur": 775,
"ste": 776,
"any": 777,
"proceed": 778,
"further": 779,
"tracking": 780,
"transition": 781,
"uni": 782,
"uniform": 783,
"steel": 784,
"##ag": 785,
"obstacles": 786,
"norther": 787,
"northern": 788,
"##lo": 789,
"it": 790,
"##ard": 791,
"joining": 792,
"##va": 793,
"lo": 794,
"base": 795,
"time": 796,
"##orne": 797,
"airborne": 798,
"##ise": 799,
"##ren": 800,
"##auti": 801,
"nauti": 802,
"nautical": 803,
"star": 804,
"##ty": 805,
"raya": 806,
"path": 807,
"##aple": 808,
"eli": 809,
"##wind": 810,
"lower": 811,
"elite": 812,
"rn": 813,
"##ai": 814,
"kid": 815,
"rnav": 816,
"kidot": 817,
"lark": 818,
"ll": 819,
"##lish": 820,
"estab": 821,
"establish": 822,
"inter": 823,
"extended": 824,
"##une": 825,
"flaple": 826,
"flapless": 827,
"eastindo": 828,
"rem": 829,
"gra": 830,
"remain": 831,
"##ute": 832,
"commence": 833,
"##lease": 834,
"required": 835,
"grass": 836,
"minute": 837,
"thanks": 838,
"trans": 839,
"##ract": 840,
"pre": 841,
"##ry": 842,
"down": 843,
"##ield": 844,
"shortly": 845,
"pract": 846,
"##ux": 847,
"inbound": 848,
"airfield": 849,
"please": 850,
"##raft": 851,
"then": 852,
"bel": 853,
"cre": 854,
"##craft": 855,
"as": 856,
"eighty": 857,
"silo": 858,
"##ip": 859,
"station": 860,
"calling": 861,
"aircraft": 862,
"if": 863,
"##cel": 864,
"leave": 865,
"structures": 866,
"##orry": 867,
"crew": 868,
"li": 869,
"sorry": 870,
"rou": 871,
"int": 872,
"##rans": 873,
"off": 874,
"nept": 875,
"neptune": 876,
"practise": 877,
"airfrans": 878,
"lion": 879,
"##es": 880,
"##lux": 881,
"cargolux": 882,
"##ok": 883,
"##vail": 884,
"##by": 885,
"avail": 886,
"alt": 887,
"available": 888,
"bat": 889,
"##ond": 890,
"how": 891,
"##ude": 892,
"##itude": 893,
"altitude": 894,
"mit": 895,
"mitos": 896,
"##ting": 897,
"kong": 898,
"hel": 899,
"upwind": 900,
"clearance": 901,
"westbound": 902,
"route": 903,
"reach": 904,
"##adem": 905,
"academ": 906,
"bobag": 907,
"academy": 908,
"##ill": 909,
"##vi": 910,
"set": 911,
"pres": 912,
"present": 913,
"nave": 914,
"##enger": 915,
"navex": 916,
"ab": 917,
"passenger": 918,
"##avy": 919,
"minutes": 920,
"excel": 921,
"heavy": 922,
"bay": 923,
"##io": 924,
"##so": 925,
"out": 926,
"ver": 927,
"##urren": 928,
"##urrent": 929,
"current": 930,
"devi": 931,
"like": 932,
"##ould": 933,
"##alam": 934,
"radio": 935,
"sou": 936,
"##kum": 937,
"##reak": 938,
"break": 939,
"mu": 940,
"##out": 941,
"heli": 942,
"unk": 943,
"##ment": 944,
"below": 945,
"##ex": 946,
"res": 947,
"fed": 948,
"ten": 949,
"about": 950,
"very": 951,
"gun": 952,
"still": 953,
"estim": 954,
"##bye": 955,
"fedex": 956,
"goodbye": 957,
"ass": 958,
"transm": 959,
"##ating": 960,
"stra": 961,
"##ik": 962,
"straight": 963,
"##ll": 964,
"standing": 965,
"hello": 966,
"local": 967,
"##ah": 968,
"so": 969,
"diam": 970,
"##ention": 971,
"diamond": 972,
"##erm": 973,
"mas": 974,
"##ung": 975,
"intention": 976,
"reaching": 977,
"see": 978,
"bang": 979,
"##ume": 980,
"##ice": 981,
"currently": 982,
"nav": 983,
"##after": 984,
"thereafter": 985,
"resume": 986,
"masbo": 987,
"light": 988,
"navig": 989,
"sab": 990,
"deviation": 991,
"not": 992,
"navigation": 993,
"fir": 994,
"batik": 995,
"estimate": 996,
"aro": 997,
"##aining": 998,
"pl": 999
}
}
}