pietrolesci committed
Commit a9481f6 · verified · 1 Parent(s): 4965a1e

Upload folder using huggingface_hub

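The commit message says the folder was pushed with huggingface_hub; a minimal sketch of that kind of upload, assuming a hypothetical repo id and local folder name (neither is stated in the commit):

# Minimal sketch of the kind of upload behind this commit.
# repo_id and folder_path below are assumptions for illustration, not taken from the commit.
from huggingface_hub import upload_folder

upload_folder(
    repo_id="pietrolesci/<repo-name>",     # hypothetical target repository
    folder_path="wordpiece32000minipile",  # local folder matching the paths in this diff
    commit_message="Upload folder using huggingface_hub",
)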
wordpiece32000minipile/raw_tok_path.txt CHANGED
@@ -1 +1 @@
-/home/pl487/rdd/outputs/tok_train/wordpiece_minipile_2025-02-03T10-57-36
+/home/pl487/rdd/outputs/tok_train/wordpiece_minipile_2025-02-03T12-10-57
wordpiece32000minipile/special_tokens_map.json CHANGED
@@ -1,3 +1,4 @@
 {
-  "eos_token": "<|endoftext|>"
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|unk|>"
 }
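With this change the tokenizer now declares an unknown token alongside the end-of-text token. A minimal sketch for checking the updated special tokens, assuming the folder has been downloaded locally under the same name:

# Sketch: inspect the special tokens after this commit.
# The local folder path is an assumption for illustration.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("wordpiece32000minipile")
print(tok.eos_token)  # "<|endoftext|>"
print(tok.unk_token)  # "<|unk|>", newly declared in special_tokens_map.json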
wordpiece32000minipile/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
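Because the tokenizer.json diff cannot be rendered in the browser, the raw file can be inspected with the tokenizers library instead; a sketch, assuming the file has been downloaded locally:

# Sketch: load the raw tokenizer.json that the web diff cannot render.
# The local file path is an assumption for illustration.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("wordpiece32000minipile/tokenizer.json")
print(tok.get_vocab_size())        # 32000 expected, going by the folder name
print(tok.token_to_id("<|unk|>"))  # id of the newly added unknown token, if present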
 
wordpiece32000minipile/tokenizer_config.json CHANGED
@@ -7,10 +7,19 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "1": {
+      "content": "<|unk|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
   "model_max_length": 1000000000000000019884624838656,
-  "tokenizer_class": "PreTrainedTokenizerFast"
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<|unk|>"
 }
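The new entry registers <|unk|> as added-token id 1 and sets it as the tokenizer's unk_token, so out-of-vocabulary pieces should map to it rather than erroring. A rough sketch of that behaviour, assuming the folder is loaded locally and that the retrained tokenizer.json (whose diff is not rendered above) wires the same unknown token into the WordPiece model:

# Sketch: out-of-vocabulary input should now resolve to the unknown token.
# The local path and the example string are assumptions for illustration.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("wordpiece32000minipile")
ids = tok.encode("☃")                  # a character unlikely to be in the 32k WordPiece vocabulary
print(ids)                             # expected to contain the <|unk|> id (1, per added_tokens_decoder)
print(tok.convert_ids_to_tokens(ids))  # expected to show "<|unk|>"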