{ "version": "1.0", "truncation": null, "padding": null, "added_tokens": [ { "id": 0, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "[PAD]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 3, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 4, "content": "[MASK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 5, "content": "[ILLEGAL]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 6, "content": "~", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": { "type": "Sequence", "normalizers": [ { "type": "NFD" }, { "type": "StripAccents" } ] }, "pre_tokenizer": { "type": "Split", "pattern": { "String": "" }, "behavior": "Isolated", "invert": false }, "post_processor": { "type": "RobertaProcessing", "sep": [ "[SEP]", 1 ], "cls": [ "[CLS]", 0 ], "trim_offsets": false, "add_prefix_space": false }, "decoder": { "type": "WordPiece", "prefix": "##", "cleanup": true }, "model": { "type": "WordPiece", "unk_token": "[UNK]", "continuing_subword_prefix": "##", "max_input_chars_per_word": 100, "vocab": { "[CLS]": 0, "[SEP]": 1, "[PAD]": 2, "[UNK]": 3, "[MASK]": 4, "[ILLEGAL]": 5, "~": 6, ">": 7, " ": 8, "#": 9, "+": 10, "-": 11, "/": 12, "0": 13, "1": 14, "2": 15, "3": 16, "4": 17, "5": 18, "6": 19, "7": 20, "8": 21, "9": 22, "=": 23, "B": 24, "K": 25, "N": 26, "O": 27, "P": 28, "Q": 29, "R": 30, "a": 31, "b": 32, "c": 33, "d": 34, "e": 35, "f": 36, "g": 37, "h": 38, "k": 39, "n": 40, "p": 41, "q": 42, "r": 43, "w": 44, "x": 45, "_": 46 } } }