Upload darija_tokenizers_leaderboard.jsonl with huggingface_hub
Browse files
darija_tokenizers_leaderboard.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{"Tokenizer":"google-bert\/bert-base-uncased","Vocabulary Size":30522,"Token Count":28,"Tokens\/Character Ratio":0.9333333333,"Latin Support":"✅","Tokenizer Class":"BertTokenizerFast"}
|
2 |
+
{"Tokenizer":"google\/gemma-2-27b-it","Vocabulary Size":256000,"Token Count":10,"Tokens\/Character Ratio":0.3333333333,"Latin Support":"✅","Tokenizer Class":"GemmaTokenizer"}
|
3 |
+
{"Tokenizer":"Xenova\/gpt-4o","Vocabulary Size":200000,"Token Count":8,"Tokens\/Character Ratio":0.2666666667,"Latin Support":"✅","Tokenizer Class":"GPT2TokenizerFast"}
|