finiteautomata committed
Commit bdea4a7
1 Parent(s): 9e03c82

Upload tokenizer

Files changed (3)
  1. added_tokens.json +6 -0
  2. tokenizer.json +36 -0
  3. tokenizer_config.json +3 -3
added_tokens.json ADDED
@@ -0,0 +1,6 @@
+{
+  "@usuario": 31002,
+  "emoji": 31005,
+  "hashtag": 31004,
+  "url": 31003
+}
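
added_tokens.json maps each new surface form (the user-mention, URL, hashtag and emoji placeholders used in tweet preprocessing) to a fixed vocabulary ID. A minimal verification sketch, assuming the tokenizer is published as piubamas/betonews-tweetcontext (the name_or_path this commit writes into tokenizer_config.json):

from transformers import AutoTokenizer

# Assumed repo id, taken from the new name_or_path below.
tok = AutoTokenizer.from_pretrained("piubamas/betonews-tweetcontext")

# Each entry in added_tokens.json should round-trip to its fixed ID.
for token in ["@usuario", "url", "hashtag", "emoji"]:
    print(token, tok.convert_tokens_to_ids(token))
# Expected per this file: 31002, 31003, 31004, 31005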
tokenizer.json CHANGED
@@ -52,6 +52,42 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 31002,
+      "content": "@usuario",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 31003,
+      "content": "url",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 31004,
+      "content": "hashtag",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 31005,
+      "content": "emoji",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
     }
   ],
   "normalizer": {
tokenizer_config.json CHANGED
@@ -3,12 +3,12 @@
   "do_basic_tokenize": true,
   "do_lower_case": false,
   "mask_token": "[MASK]",
-  "model_max_length": 512,
-  "name_or_path": "dccuchile/bert-base-spanish-wwm-cased",
+  "model_max_length": 256,
+  "name_or_path": "piubamas/betonews-tweetcontext",
   "never_split": null,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "special_tokens_map_file": "/root/.cache/huggingface/hub/models--dccuchile--bert-base-spanish-wwm-cased/snapshots/56a7647b957a4230fc3f80dafbe80f2ba9b0de73/special_tokens_map.json",
+  "special_tokens_map_file": "/home/jmperez/.cache/huggingface/transformers/9848a00af462c42dfb4ec88ef438fbab5256330f7f6f50badc48d277f9367d49.f982506b52498d4adb4bd491f593dc92b2ef6be61bfdbe9d30f53f963f9f5b66",
   "strip_accents": false,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "BertTokenizer",