wilsonmarciliojr committed on
Commit
e1025e3
·
verified ·
1 Parent(s): 75186c8

Upload data_budget_hours_24.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_budget_hours_24.json +33 -0
data_budget_hours_24.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sources": {
3
+ "ag_news": {
4
+ "provider": "huggingface",
5
+ "split": "train",
6
+ "streaming": false,
7
+ "remove_columns": "label",
8
+ "concatenate_successive_entries": 0
9
+ }
10
+ },
11
+ "name": "sanity-check-2",
12
+ "normalizer": {
13
+ "force_lowercase": true,
14
+ "strip_accents": true,
15
+ "force_english_keyboard": true,
16
+ "whitespace_escape": false
17
+ },
18
+ "tokenizer": "BPE",
19
+ "vocab_size": 32768,
20
+ "seq_length": 128,
21
+ "include_cls_token_in_corpus": false,
22
+ "include_sep_token_in_corpus": false,
23
+ "use_type_ids": false,
24
+ "max_entries_in_raw_dataset": 10000000000.0,
25
+ "max_seq_in_tokenized_dataset": 10000000000.0,
26
+ "named_entity_simplification": false,
27
+ "remove_whitespaces": false,
28
+ "remove_trash": false,
29
+ "trash_cutoff": 0.3,
30
+ "deduplicate_entries": false,
31
+ "deduplication_threshold": 100,
32
+ "ordering": "randomized"
33
+ }