mihaimasala
committed on
Update README.md
Browse files
README.md
CHANGED
@@ -35,7 +35,7 @@ model-index:
|
|
35 |
metrics:
|
36 |
- name: Average accuracy
|
37 |
type: accuracy
|
38 |
-
value: 52.
|
39 |
- task:
|
40 |
type: text-generation
|
41 |
dataset:
|
@@ -62,7 +62,7 @@ model-index:
|
|
62 |
metrics:
|
63 |
- name: Average accuracy
|
64 |
type: accuracy
|
65 |
-
value: 65.
|
66 |
- task:
|
67 |
type: text-generation
|
68 |
dataset:
|
@@ -89,7 +89,7 @@ model-index:
|
|
89 |
metrics:
|
90 |
- name: Average accuracy
|
91 |
type: accuracy
|
92 |
-
value: 47.
|
93 |
- task:
|
94 |
type: text-generation
|
95 |
dataset:
|
@@ -98,7 +98,7 @@ model-index:
|
|
98 |
metrics:
|
99 |
- name: Average macro-f1
|
100 |
type: macro-f1
|
101 |
-
value:
|
102 |
- task:
|
103 |
type: text-generation
|
104 |
dataset:
|
@@ -107,7 +107,7 @@ model-index:
|
|
107 |
metrics:
|
108 |
- name: Average macro-f1
|
109 |
type: macro-f1
|
110 |
-
value:
|
111 |
- task:
|
112 |
type: text-generation
|
113 |
dataset:
|
@@ -134,7 +134,7 @@ model-index:
|
|
134 |
metrics:
|
135 |
- name: Average bleu
|
136 |
type: bleu
|
137 |
-
value:
|
138 |
- task:
|
139 |
type: text-generation
|
140 |
dataset:
|
@@ -143,7 +143,7 @@ model-index:
|
|
143 |
metrics:
|
144 |
- name: Average bleu
|
145 |
type: bleu
|
146 |
-
value:
|
147 |
- task:
|
148 |
type: text-generation
|
149 |
dataset:
|
@@ -170,7 +170,7 @@ model-index:
|
|
170 |
metrics:
|
171 |
- name: Average exact_match
|
172 |
type: exact_match
|
173 |
-
value:
|
174 |
- task:
|
175 |
type: text-generation
|
176 |
dataset:
|
@@ -179,7 +179,7 @@ model-index:
|
|
179 |
metrics:
|
180 |
- name: Average f1
|
181 |
type: f1
|
182 |
-
value:
|
183 |
- task:
|
184 |
type: text-generation
|
185 |
dataset:
|
@@ -206,7 +206,7 @@ model-index:
|
|
206 |
metrics:
|
207 |
- name: Average spearman
|
208 |
type: spearman
|
209 |
-
value:
|
210 |
- task:
|
211 |
type: text-generation
|
212 |
dataset:
|
@@ -215,7 +215,7 @@ model-index:
|
|
215 |
metrics:
|
216 |
- name: Average pearson
|
217 |
type: pearson
|
218 |
-
value:
|
219 |
- task:
|
220 |
type: text-generation
|
221 |
dataset:
|
@@ -299,7 +299,7 @@ model-index:
|
|
299 |
value: 64.40
|
300 |
- name: 1-shot
|
301 |
type: accuracy
|
302 |
-
value: 66.
|
303 |
- name: 3-shot
|
304 |
type: accuracy
|
305 |
value: 65.75
|
@@ -350,16 +350,16 @@ model-index:
|
|
350 |
metrics:
|
351 |
- name: 0-shot
|
352 |
type: macro-f1
|
353 |
-
value:
|
354 |
- name: 1-shot
|
355 |
type: macro-f1
|
356 |
-
value:
|
357 |
- name: 3-shot
|
358 |
type: macro-f1
|
359 |
-
value:
|
360 |
- name: 5-shot
|
361 |
type: macro-f1
|
362 |
-
value:
|
363 |
- task:
|
364 |
type: text-generation
|
365 |
dataset:
|
@@ -368,16 +368,16 @@ model-index:
|
|
368 |
metrics:
|
369 |
- name: 0-shot
|
370 |
type: macro-f1
|
371 |
-
value:
|
372 |
- name: 1-shot
|
373 |
type: macro-f1
|
374 |
-
value:
|
375 |
- name: 3-shot
|
376 |
type: macro-f1
|
377 |
-
value:
|
378 |
- name: 5-shot
|
379 |
type: macro-f1
|
380 |
-
value:
|
381 |
- task:
|
382 |
type: text-generation
|
383 |
dataset:
|
@@ -386,16 +386,16 @@ model-index:
|
|
386 |
metrics:
|
387 |
- name: 0-shot
|
388 |
type: bleu
|
389 |
-
value:
|
390 |
- name: 1-shot
|
391 |
type: bleu
|
392 |
-
value:
|
393 |
- name: 3-shot
|
394 |
type: bleu
|
395 |
-
value:
|
396 |
- name: 5-shot
|
397 |
type: bleu
|
398 |
-
value:
|
399 |
- task:
|
400 |
type: text-generation
|
401 |
dataset:
|
@@ -404,16 +404,16 @@ model-index:
|
|
404 |
metrics:
|
405 |
- name: 0-shot
|
406 |
type: bleu
|
407 |
-
value:
|
408 |
- name: 1-shot
|
409 |
type: bleu
|
410 |
-
value:
|
411 |
- name: 3-shot
|
412 |
type: bleu
|
413 |
-
value:
|
414 |
- name: 5-shot
|
415 |
type: bleu
|
416 |
-
value:
|
417 |
- task:
|
418 |
type: text-generation
|
419 |
dataset:
|
@@ -422,16 +422,16 @@ model-index:
|
|
422 |
metrics:
|
423 |
- name: 0-shot
|
424 |
type: exact_match
|
425 |
-
value:
|
426 |
- name: 1-shot
|
427 |
type: exact_match
|
428 |
-
value:
|
429 |
- name: 3-shot
|
430 |
type: exact_match
|
431 |
-
value:
|
432 |
- name: 5-shot
|
433 |
type: exact_match
|
434 |
-
value:
|
435 |
- task:
|
436 |
type: text-generation
|
437 |
dataset:
|
@@ -440,16 +440,16 @@ model-index:
|
|
440 |
metrics:
|
441 |
- name: 0-shot
|
442 |
type: f1
|
443 |
-
value:
|
444 |
- name: 1-shot
|
445 |
type: f1
|
446 |
-
value:
|
447 |
- name: 3-shot
|
448 |
type: f1
|
449 |
-
value:
|
450 |
- name: 5-shot
|
451 |
type: f1
|
452 |
-
value:
|
453 |
- task:
|
454 |
type: text-generation
|
455 |
dataset:
|
@@ -458,13 +458,13 @@ model-index:
|
|
458 |
metrics:
|
459 |
- name: 0-shot
|
460 |
type: spearman
|
461 |
-
value:
|
462 |
- name: 1-shot
|
463 |
type: spearman
|
464 |
-
value:
|
465 |
- name: 3-shot
|
466 |
type: spearman
|
467 |
-
value:
|
468 |
- task:
|
469 |
type: text-generation
|
470 |
dataset:
|
@@ -473,13 +473,14 @@ model-index:
|
|
473 |
metrics:
|
474 |
- name: 0-shot
|
475 |
type: pearson
|
476 |
-
value:
|
477 |
- name: 1-shot
|
478 |
type: pearson
|
479 |
-
value:
|
480 |
- name: 3-shot
|
481 |
type: pearson
|
482 |
-
value:
|
|
|
483 |
|
484 |
---
|
485 |
|
@@ -608,13 +609,13 @@ print(tokenizer.decode(outputs[0]))
|
|
608 |
<td><center><strong>RO-EN<br>(Bleu)</strong></center></td>
|
609 |
</tr>
|
610 |
<tr>
|
611 |
-
<td>Llama-3.1-8B-Instruct</td><td><center
|
612 |
</tr>
|
613 |
<tr>
|
614 |
<td>RoLlama3.1-8b-Instruct-2024-10-09</td><td><center>94.56</center></td><td><center><strong>60.10</strong></center></td><td><center>95.12</center></td><td><center><strong>87.53</strong></center></td><td><center><strong>21.88</strong></center></td><td><center>23.99</center></td><td><center>28.27</center></td><td><center><strong>40.44</strong></center></td>
|
615 |
</tr>
|
616 |
<tr>
|
617 |
-
<td><em>RoLlama3.1-8b-Instruct-DPO-2024-10-09</em></td><td><center><em
|
618 |
</tr>
|
619 |
</tbody>
|
620 |
</table>
|
@@ -649,15 +650,14 @@ print(tokenizer.decode(outputs[0]))
|
|
649 |
<td>Llama-3.1-8B-Instruct</td><td><center><strong>44.96</strong></center></td><td><center><strong>64.45</strong></center></td><td><center><strong>69.50</strong></center></td><td><center><strong>84.31</strong></center></td><td><center>72.11</center></td><td><center>71.64</center></td><td><center>84.59</center></td><td><center>84.96</center></td>
|
650 |
</tr>
|
651 |
<tr>
|
652 |
-
<td>RoLlama3.1-8b-Instruct-2024-10-09</td><td><center>13.59</center></td><td><center>23.56</center></td><td><center>49.41</center></td><td><center>62.93</center></td><td><center
|
653 |
</tr>
|
654 |
<tr>
|
655 |
-
<td><em>RoLlama3.1-8b-Instruct-DPO-2024-10-09</em></td><td><center><em
|
656 |
</tr>
|
657 |
</tbody>
|
658 |
</table>
|
659 |
|
660 |
-
|
661 |
## MT-Bench
|
662 |
|
663 |
<table>
|
|
|
35 |
metrics:
|
36 |
- name: Average accuracy
|
37 |
type: accuracy
|
38 |
+
value: 52.74
|
39 |
- task:
|
40 |
type: text-generation
|
41 |
dataset:
|
|
|
62 |
metrics:
|
63 |
- name: Average accuracy
|
64 |
type: accuracy
|
65 |
+
value: 65.87
|
66 |
- task:
|
67 |
type: text-generation
|
68 |
dataset:
|
|
|
89 |
metrics:
|
90 |
- name: Average accuracy
|
91 |
type: accuracy
|
92 |
+
value: 47.82
|
93 |
- task:
|
94 |
type: text-generation
|
95 |
dataset:
|
|
|
98 |
metrics:
|
99 |
- name: Average macro-f1
|
100 |
type: macro-f1
|
101 |
+
value: 96.10
|
102 |
- task:
|
103 |
type: text-generation
|
104 |
dataset:
|
|
|
107 |
metrics:
|
108 |
- name: Average macro-f1
|
109 |
type: macro-f1
|
110 |
+
value: 55.37
|
111 |
- task:
|
112 |
type: text-generation
|
113 |
dataset:
|
|
|
134 |
metrics:
|
135 |
- name: Average bleu
|
136 |
type: bleu
|
137 |
+
value: 21.29
|
138 |
- task:
|
139 |
type: text-generation
|
140 |
dataset:
|
|
|
143 |
metrics:
|
144 |
- name: Average bleu
|
145 |
type: bleu
|
146 |
+
value: 21.86
|
147 |
- task:
|
148 |
type: text-generation
|
149 |
dataset:
|
|
|
170 |
metrics:
|
171 |
- name: Average exact_match
|
172 |
type: exact_match
|
173 |
+
value: 21.58
|
174 |
- task:
|
175 |
type: text-generation
|
176 |
dataset:
|
|
|
179 |
metrics:
|
180 |
- name: Average f1
|
181 |
type: f1
|
182 |
+
value: 36.54
|
183 |
- task:
|
184 |
type: text-generation
|
185 |
dataset:
|
|
|
206 |
metrics:
|
207 |
- name: Average spearman
|
208 |
type: spearman
|
209 |
+
value: 78.01
|
210 |
- task:
|
211 |
type: text-generation
|
212 |
dataset:
|
|
|
215 |
metrics:
|
216 |
- name: Average pearson
|
217 |
type: pearson
|
218 |
+
value: 77.98
|
219 |
- task:
|
220 |
type: text-generation
|
221 |
dataset:
|
|
|
299 |
value: 64.40
|
300 |
- name: 1-shot
|
301 |
type: accuracy
|
302 |
+
value: 66.22
|
303 |
- name: 3-shot
|
304 |
type: accuracy
|
305 |
value: 65.75
|
|
|
350 |
metrics:
|
351 |
- name: 0-shot
|
352 |
type: macro-f1
|
353 |
+
value: 93.11
|
354 |
- name: 1-shot
|
355 |
type: macro-f1
|
356 |
+
value: 96.06
|
357 |
- name: 3-shot
|
358 |
type: macro-f1
|
359 |
+
value: 97.53
|
360 |
- name: 5-shot
|
361 |
type: macro-f1
|
362 |
+
value: 97.70
|
363 |
- task:
|
364 |
type: text-generation
|
365 |
dataset:
|
|
|
368 |
metrics:
|
369 |
- name: 0-shot
|
370 |
type: macro-f1
|
371 |
+
value: 65.61
|
372 |
- name: 1-shot
|
373 |
type: macro-f1
|
374 |
+
value: 55.73
|
375 |
- name: 3-shot
|
376 |
type: macro-f1
|
377 |
+
value: 46.33
|
378 |
- name: 5-shot
|
379 |
type: macro-f1
|
380 |
+
value: 53.82
|
381 |
- task:
|
382 |
type: text-generation
|
383 |
dataset:
|
|
|
386 |
metrics:
|
387 |
- name: 0-shot
|
388 |
type: bleu
|
389 |
+
value: 6.89
|
390 |
- name: 1-shot
|
391 |
type: bleu
|
392 |
+
value: 26.62
|
393 |
- name: 3-shot
|
394 |
type: bleu
|
395 |
+
value: 25.70
|
396 |
- name: 5-shot
|
397 |
type: bleu
|
398 |
+
value: 25.94
|
399 |
- task:
|
400 |
type: text-generation
|
401 |
dataset:
|
|
|
404 |
metrics:
|
405 |
- name: 0-shot
|
406 |
type: bleu
|
407 |
+
value: 2.16
|
408 |
- name: 1-shot
|
409 |
type: bleu
|
410 |
+
value: 16.65
|
411 |
- name: 3-shot
|
412 |
type: bleu
|
413 |
+
value: 33.41
|
414 |
- name: 5-shot
|
415 |
type: bleu
|
416 |
+
value: 35.22
|
417 |
- task:
|
418 |
type: text-generation
|
419 |
dataset:
|
|
|
422 |
metrics:
|
423 |
- name: 0-shot
|
424 |
type: exact_match
|
425 |
+
value: 8.99
|
426 |
- name: 1-shot
|
427 |
type: exact_match
|
428 |
+
value: 35.88
|
429 |
- name: 3-shot
|
430 |
type: exact_match
|
431 |
+
value: 31.26
|
432 |
- name: 5-shot
|
433 |
type: exact_match
|
434 |
+
value: 10.17
|
435 |
- task:
|
436 |
type: text-generation
|
437 |
dataset:
|
|
|
440 |
metrics:
|
441 |
- name: 0-shot
|
442 |
type: f1
|
443 |
+
value: 20.00
|
444 |
- name: 1-shot
|
445 |
type: f1
|
446 |
+
value: 59.41
|
447 |
- name: 3-shot
|
448 |
type: f1
|
449 |
+
value: 48.41
|
450 |
- name: 5-shot
|
451 |
type: f1
|
452 |
+
value: 18.33
|
453 |
- task:
|
454 |
type: text-generation
|
455 |
dataset:
|
|
|
458 |
metrics:
|
459 |
- name: 0-shot
|
460 |
type: spearman
|
461 |
+
value: 78.10
|
462 |
- name: 1-shot
|
463 |
type: spearman
|
464 |
+
value: 77.81
|
465 |
- name: 3-shot
|
466 |
type: spearman
|
467 |
+
value: 78.11
|
468 |
- task:
|
469 |
type: text-generation
|
470 |
dataset:
|
|
|
473 |
metrics:
|
474 |
- name: 0-shot
|
475 |
type: pearson
|
476 |
+
value: 78.30
|
477 |
- name: 1-shot
|
478 |
type: pearson
|
479 |
+
value: 77.58
|
480 |
- name: 3-shot
|
481 |
type: pearson
|
482 |
+
value: 78.06
|
483 |
+
|
484 |
|
485 |
---
|
486 |
|
|
|
609 |
<td><center><strong>RO-EN<br>(Bleu)</strong></center></td>
|
610 |
</tr>
|
611 |
<tr>
|
612 |
+
<td>Llama-3.1-8B-Instruct</td><td><center>95.74</center></td><td><center>59.49</center></td><td><center><strong>98.57</strong></center></td><td><center>82.41</center></td><td><center>19.01</center></td><td><center><strong>27.77</strong></center></td><td><center><strong>29.02</strong></center></td><td><center>39.80</center></td>
|
613 |
</tr>
|
614 |
<tr>
|
615 |
<td>RoLlama3.1-8b-Instruct-2024-10-09</td><td><center>94.56</center></td><td><center><strong>60.10</strong></center></td><td><center>95.12</center></td><td><center><strong>87.53</strong></center></td><td><center><strong>21.88</strong></center></td><td><center>23.99</center></td><td><center>28.27</center></td><td><center><strong>40.44</strong></center></td>
|
616 |
</tr>
|
617 |
<tr>
|
618 |
+
<td><em>RoLlama3.1-8b-Instruct-DPO-2024-10-09</em></td><td><center><em><strong>96.10</strong></em></center></td><td><center><em>55.37</em></center></td><td><center><em>-</em></center></td><td><center><em>-</em></center></td><td><center><em>21.29</em></center></td><td><center><em>21.86</em></center></td><td><center><em>-</em></center></td><td><center><em>-</em></center></td>
|
619 |
</tr>
|
620 |
</tbody>
|
621 |
</table>
|
|
|
650 |
<td>Llama-3.1-8B-Instruct</td><td><center><strong>44.96</strong></center></td><td><center><strong>64.45</strong></center></td><td><center><strong>69.50</strong></center></td><td><center><strong>84.31</strong></center></td><td><center>72.11</center></td><td><center>71.64</center></td><td><center>84.59</center></td><td><center>84.96</center></td>
|
651 |
</tr>
|
652 |
<tr>
|
653 |
+
<td>RoLlama3.1-8b-Instruct-2024-10-09</td><td><center>13.59</center></td><td><center>23.56</center></td><td><center>49.41</center></td><td><center>62.93</center></td><td><center>75.89</center></td><td><center>76.00</center></td><td><center><strong>86.86</strong></center></td><td><center><strong>87.05</strong></center></td>
|
654 |
</tr>
|
655 |
<tr>
|
656 |
+
<td><em>RoLlama3.1-8b-Instruct-DPO-2024-10-09</em></td><td><center><em>21.58</em></center></td><td><center><em>36.54</em></center></td><td><center><em>-</em></center></td><td><center><em>-</em></center></td><td><center><em><strong>78.01</strong></em></center></td><td><center><em><strong>77.98</strong></em></center></td><td><center><em>-</em></center></td><td><center><em>-</em></center></td>
|
657 |
</tr>
|
658 |
</tbody>
|
659 |
</table>
|
660 |
|
|
|
661 |
## MT-Bench
|
662 |
|
663 |
<table>
|