DGurgurov committed on
Commit
401e133
·
verified ·
1 Parent(s): 3b32078

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +75 -306
README.md CHANGED
@@ -32,313 +32,82 @@ def read_embeddings_from_text(file_path, embedding_size=300):
32
  return embeddings
33
  ```
34
 
35
- ### Dataset Details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- | Language Code | Vocabulary Size |
38
- | --- | ------- |
39
- | ab | 252 |
40
- | adx | 549 |
41
- | ae | 192 |
42
- | af | 12973 |
43
- | ang | 9788 |
44
- | ar | 75684 |
45
- | arc | 1688 |
46
- | arn | 1181 |
47
- | ast | 27485 |
48
- | av | 172 |
49
- | az | 13277 |
50
- | ba | 4250 |
51
- | bal | 370 |
52
- | be | 14871 |
53
- | bg | 171740 |
54
- | bm | 2422 |
55
- | bn | 7306 |
56
- | bo | 2127 |
57
- | br | 11665 |
58
- | ca | 82706 |
59
- | ce | 2311 |
60
- | ceb | 18882 |
61
- | chk | 724 |
62
- | cim | 889 |
63
- | cop | 1071 |
64
- | crh | 2449 |
65
- | cs | 77422 |
66
- | csb | 602 |
67
- | cu | 7526 |
68
- | cy | 13243 |
69
- | da | 46600 |
70
- | de | 500260 |
71
- | dsb | 3993 |
72
- | ee | 571 |
73
- | egl | 854 |
74
- | egx | 1890 |
75
- | egy | 447 |
76
- | el | 39667 |
77
- | en | 941858 |
78
- | enm | 17286 |
79
- | eo | 91074 |
80
- | es | 646097 |
81
- | et | 20088 |
82
- | eu | 41427 |
83
- | fa | 46736 |
84
- | fi | 259852 |
85
- | fil | 16165 |
86
- | fj | 209 |
87
- | fo | 10513 |
88
- | fr | 1449790 |
89
- | frm | 4472 |
90
- | fro | 14493 |
91
- | frp | 2799 |
92
- | frr | 476 |
93
- | fur | 2295 |
94
- | fy | 7608 |
95
- | ga | 29459 |
96
- | gag | 505 |
97
- | gd | 14418 |
98
- | gl | 52824 |
99
- | gml | 177 |
100
- | got | 2982 |
101
- | grc | 25689 |
102
- | gu | 4427 |
103
- | gv | 6812 |
104
- | haw | 1371 |
105
- | hbo | 2898 |
106
- | he | 27283 |
107
- | hi | 18363 |
108
- | hil | 1414 |
109
- | hsb | 25778 |
110
- | ht | 2699 |
111
- | hu | 65163 |
112
- | hy | 23434 |
113
- | ia | 5728 |
114
- | io | 21076 |
115
- | is | 40287 |
116
- | ist | 422 |
117
- | it | 548767 |
118
- | iu | 1871 |
119
- | ja | 283049 |
120
- | ka | 25014 |
121
- | khb | 297 |
122
- | ki | 1374 |
123
- | kjh | 482 |
124
- | kk | 13700 |
125
- | kl | 1427 |
126
- | km | 3466 |
127
- | ko | 30616 |
128
- | koy | 205 |
129
- | ku | 9737 |
130
- | kw | 1797 |
131
- | ky | 3574 |
132
- | la | 848943 |
133
- | lad | 1453 |
134
- | lb | 10863 |
135
- | li | 485 |
136
- | lij | 1331 |
137
- | lld | 4884 |
138
- | lmo | 2109 |
139
- | ln | 4109 |
140
- | lo | 1422 |
141
- | lt | 21184 |
142
- | lv | 30059 |
143
- | mdf | 2086 |
144
- | mg | 26575 |
145
- | mga | 178 |
146
- | mi | 945 |
147
- | mk | 28935 |
148
- | mn | 6740 |
149
- | ms | 88416 |
150
- | mt | 2006 |
151
- | mul | 16034 |
152
- | mwl | 1302 |
153
- | my | 4875 |
154
- | myv | 642 |
155
- | nap | 1506 |
156
- | nci | 3358 |
157
- | nds | 5192 |
158
- | nl | 138580 |
159
- | no | 94946 |
160
- | nog | 450 |
161
- | non | 4079 |
162
- | nov | 649 |
163
- | nrf | 9724 |
164
- | nv | 6333 |
165
- | oc | 22113 |
166
- | oge | 438 |
167
- | osp | 458 |
168
- | ota | 834 |
169
- | pal | 256 |
170
- | pcd | 1424 |
171
- | pi | 1828 |
172
- | pjt | 364 |
173
- | pl | 139396 |
174
- | ppl | 268 |
175
- | pro | 2798 |
176
- | ps | 1087 |
177
- | pt | 248669 |
178
- | rm | 3919 |
179
- | ro | 36206 |
180
- | rom | 552 |
181
- | ru | 424944 |
182
- | rue | 200 |
183
- | rup | 3079 |
184
- | rw | 355 |
185
- | sa | 5789 |
186
- | scn | 4749 |
187
- | sco | 8537 |
188
- | se | 68758 |
189
- | ses | 3095 |
190
- | sga | 2913 |
191
- | sh | 57974 |
192
- | sk | 21657 |
193
- | sl | 89210 |
194
- | sm | 588 |
195
- | so | 593 |
196
- | sq | 16262 |
197
- | stq | 1237 |
198
- | su | 2514 |
199
- | sv | 133965 |
200
- | sw | 9131 |
201
- | swb | 672 |
202
- | syc | 2855 |
203
- | szl | 237 |
204
- | ta | 9064 |
205
- | te | 18707 |
206
- | tg | 2937 |
207
- | th | 94281 |
208
- | tk | 815 |
209
- | tpi | 1511 |
210
- | tpw | 270 |
211
- | tr | 38490 |
212
- | tt | 4676 |
213
- | ty | 293 |
214
- | tyv | 337 |
215
- | ug | 998 |
216
- | uk | 27682 |
217
- | ur | 8476 |
218
- | uz | 5224 |
219
- | vec | 5555 |
220
- | vep | 2867 |
221
- | vi | 37433 |
222
- | vo | 8277 |
223
- | vot | 489 |
224
- | wa | 1956 |
225
- | wau | 184 |
226
- | wo | 1196 |
227
- | wym | 1330 |
228
- | xcl | 16182 |
229
- | yi | 8054 |
230
- | yua | 735 |
231
- | za | 473 |
232
- | zh | 274080 |
233
- | zza | 621 |
234
- | abe | 185 |
235
- | ady | 3807 |
236
- | ain | 298 |
237
- | akk | 313 |
238
- | akz | 151 |
239
- | alt | 289 |
240
- | an | 4457 |
241
- | axm | 350 |
242
- | ccc | 445 |
243
- | ch | 174 |
244
- | chl | 528 |
245
- | cho | 155 |
246
- | chr | 1087 |
247
- | cic | 699 |
248
- | cjs | 306 |
249
- | cv | 2892 |
250
- | dlm | 1091 |
251
- | dum | 2040 |
252
- | esu | 227 |
253
- | ff | 215 |
254
- | gmh | 217 |
255
- | gn | 131 |
256
- | goh | 2002 |
257
- | gsw | 2336 |
258
- | ha | 802 |
259
- | hit | 221 |
260
- | ie | 637 |
261
- | ii | 51 |
262
- | ilo | 442 |
263
- | jv | 4919 |
264
- | kbd | 762 |
265
- | kn | 3415 |
266
- | krl | 637 |
267
- | liv | 569 |
268
- | lkt | 682 |
269
- | ltg | 139 |
270
- | lzz | 127 |
271
- | mch | 384 |
272
- | mh | 200 |
273
- | ml | 6750 |
274
- | mr | 5545 |
275
- | na | 200 |
276
- | nah | 1612 |
277
- | nan | 486 |
278
- | ne | 4224 |
279
- | nhn | 269 |
280
- | nmn | 313 |
281
- | odt | 365 |
282
- | ofs | 345 |
283
- | oj | 587 |
284
- | or | 109 |
285
- | orv | 199 |
286
- | os | 4481 |
287
- | osx | 1848 |
288
- | pa | 4488 |
289
- | pap | 3612 |
290
- | peo | 184 |
291
- | pms | 2857 |
292
- | qu | 5156 |
293
- | raj | 190 |
294
- | rap | 313 |
295
- | sah | 2695 |
296
- | sc | 573 |
297
- | sd | 143 |
298
- | si | 2062 |
299
- | smn | 511 |
300
- | sms | 493 |
301
- | srn | 1249 |
302
- | sux | 785 |
303
- | tet | 361 |
304
- | twf | 527 |
305
- | txb | 588 |
306
- | uga | 573 |
307
- | war | 12987 |
308
- | xh | 2504 |
309
- | xmf | 149 |
310
- | xpr | 98 |
311
- | xwo | 456 |
312
- | yo | 2283 |
313
- | zu | 2758 |
314
- | co | 1474 |
315
- | prg | 480 |
316
- | aii | 345 |
317
- | am | 1909 |
318
- | bi | 92 |
319
- | dv | 117 |
320
- | kim | 388 |
321
- | krc | 460 |
322
- | kum | 505 |
323
- | ti | 292 |
324
- | udm | 306 |
325
- | xto | 121 |
326
- | zdj | 58 |
327
- | dak | 879 |
328
- | frk | 1 |
329
- | oma | 748 |
330
- | shh | 185 |
331
- | aa | 725 |
332
- | dje | 338 |
333
- | hke | 246 |
334
- | qya | 180 |
335
- | st | 102 |
336
- | wae | 437 |
337
- | xno | 274 |
338
- | dua | 317 |
339
- | fon | 805 |
340
- | hak | 4 |
341
- | jbo | 32 |
342
 
343
  ### Licensing Information
344
 
 
32
  return embeddings
33
  ```
34
 
35
+ ### Language Details
36
+
37
+ | Language Code | Language Name | Vocabulary Size |
38
+ | --- | --- | --- |
39
+ | af | Afrikaans | 12973 |
40
+ | sc | Sardinian | 573 |
41
+ | yo | Yoruba | 2283 |
42
+ | gn | Guarani | 131 |
43
+ | qu | Quechua | 5156 |
44
+ | li | Limburgish | 485 |
45
+ | ln | Lingala | 4109 |
46
+ | wo | Wolof | 1196 |
47
+ | zu | Zulu | 2758 |
48
+ | rm | Romansh | 3919 |
49
+ | ht | Haitian Creole | 2699 |
50
+ | su | Sundanese | 2514 |
51
+ | br | Breton | 11665 |
52
+ | gd | Scottish Gaelic | 14418 |
53
+ | xh | Xhosa | 2504 |
54
+ | mg | Malagasy | 26575 |
55
+ | jv | Javanese | 4919 |
56
+ | fy | Frisian | 7608 |
57
+ | sa | Sanskrit | 5789 |
58
+ | my | Burmese | 4875 |
59
+ | ug | Uyghur | 998 |
60
+ | yi | Yiddish | 8054 |
61
+ | or | Oriya | 109 |
62
+ | ha | Hausa | 802 |
63
+ | la | Latin | 848943 |
64
+ | sd | Sindhi | 143 |
65
+ | so | Somali | 593 |
66
+ | ku | Kurdish | 9737 |
67
+ | pa | Punjabi | 4488 |
68
+ | ps | Pashto | 1087 |
69
+ | ga | Irish | 29459 |
70
+ | am | Amharic | 1909 |
71
+ | km | Khmer | 3466 |
72
+ | uz | Uzbek | 5224 |
73
+ | ky | Kyrgyz | 3574 |
74
+ | cy | Welsh | 13243 |
75
+ | gu | Gujarati | 4427 |
76
+ | eo | Esperanto | 91074 |
77
+ | sw | Swahili | 9131 |
78
+ | mr | Marathi | 5545 |
79
+ | kn | Kannada | 3415 |
80
+ | ne | Nepali | 4224 |
81
+ | mn | Mongolian | 6740 |
82
+ | si | Sinhala | 2062 |
83
+ | te | Telugu | 18707 |
84
+ | be | Belarusian | 14871 |
85
+ | mk | Macedonian | 28935 |
86
+ | gl | Galician | 52824 |
87
+ | hy | Armenian | 23434 |
88
+ | is | Icelandic | 40287 |
89
+ | ml | Malayalam | 6750 |
90
+ | bn | Bengali | 7306 |
91
+ | ur | Urdu | 8476 |
92
+ | kk | Kazakh | 13700 |
93
+ | ka | Georgian | 25014 |
94
+ | az | Azerbaijani | 13277 |
95
+ | sq | Albanian | 16262 |
96
+ | ta | Tamil | 9064 |
97
+ | et | Estonian | 20088 |
98
+ | lv | Latvian | 30059 |
99
+ | ms | Malay | 88416 |
100
+ | sl | Slovenian | 89210 |
101
+ | lt | Lithuanian | 21184 |
102
+ | he | Hebrew | 27283 |
103
+ | sk | Slovak | 21657 |
104
+ | el | Greek | 39667 |
105
+ | th | Thai | 94281 |
106
+ | bg | Bulgarian | 171740 |
107
+ | da | Danish | 46600 |
108
+ | uk | Ukrainian | 27682 |
109
+ | ro | Romanian | 36206 |
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  ### Licensing Information
113