# BPE tokenizer preprocessor configuration.
# Trailing " | |" residue was removed from every line: it made the document
# invalid YAML (e.g. "special_tokens: | |", "'' | |", "[] | |") and would have
# turned numeric/boolean values into strings.
name: bpe_tokenizer
config_type: preprocessor

# --- Truncation ---
max_length: 512
truncation_strategy: longest_first
truncation_direction: right
stride: 0

# --- Padding ---
padding_strategy: longest
padding_direction: right
# NOTE(review): 0 presumably means "no multiple-of constraint"; confirm the
# consumer does not divide/modulo by this value.
pad_to_multiple_of: 0
pad_token_type_id: 0

# --- Named special tokens ---
# Quoted so the angle-bracket literals are always read as plain strings.
bos_token: '<s>'
eos_token: '</s>'
unk_token: '<unk>'
sep_token: '<sep>'
# NOTE(review): pad_token is the same string as eos_token, while '<pad>' is
# declared in special_tokens below but not referenced by any *_token key.
# Confirm that padding with '</s>' is intentional.
pad_token: '</s>'
cls_token: '<cls>'
mask_token: '<mask>'

# --- Full list of special tokens (order preserved from the original) ---
special_tokens:
  - '<s>'
  - '<pad>'
  - '</s>'
  - '<unk>'
  - '<mask>'
  - '<|endoftext|>'
  - '<|startoftext|>'
  - '<nl>'
  - '<hs>'
  - '<sep>'
  - '<cls>'

# --- BPE model / training options ---
continuing_subword_prefix: ''
end_of_word_suffix: ''
fuse_unk: false
vocab_size: 42000
min_frequency: 2
limit_alphabet: 1000
initial_alphabet: []
show_progress: true