Surprisingly poor performance of snowflake-arctic-embed-m relative to snowflake-arctic-embed-s
I have been surprised to see the performance of snowflake-arctic-embed-m fall far below that of snowflake-arctic-embed-s on a number of tasks. I am using sentence transformers for inference. Is this expected? For example, running the models on the NFCorpus task, I get the following ndcg@10 scores:
- snowflake-arctic-embed-s: 0.26669
- snowflake-arctic-embed-m: 0.03882
Arctic M 1.0 scores around 0.37 on NFCorpus when run properly. If you want to share the code you are seeing this performance issue with, I can take a look and see if anything jumps out as the cause. You may also want to try running the model directly via the transformers usage example in the documentation (rather than via sentence transformers) as a point of comparison.
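For reference, here is a minimal sketch of what I mean by running it directly via transformers, modeled on the CLS-pooling usage from the model card (the example texts and variable names are illustrative, so double-check the prefix and pooling against the actual documentation):

```python
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "Snowflake/snowflake-arctic-embed-m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, add_pooling_layer=False)
model.eval()

# Queries get the retrieval instruction prefix; documents do not.
# (Prefix as documented on the model card; verify for your model version.)
query_prefix = "Represent this sentence for searching relevant passages: "
queries = [query_prefix + "what treatments exist for fatty liver disease?"]
documents = ["Non-alcoholic fatty liver disease is managed primarily with lifestyle changes."]

def embed(texts):
    tokens = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        # CLS-token embedding, L2-normalized so dot product == cosine similarity
        embeddings = model(**tokens)[0][:, 0]
    return torch.nn.functional.normalize(embeddings, p=2, dim=1)

scores = embed(queries) @ embed(documents).T
print(scores)
```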
This was using the mteb package, and these are the results I obtained:
{
"dataset_revision": "ec0fa4fe99da2ff19ca1214b7966684033a58814",
"evaluation_time": 19.430282831192017,
"kg_co2_emissions": null,
"mteb_version": "1.13.0",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"eng-Latn"
],
"main_score": 0.03882,
"map_at_1": 0.00256,
"map_at_10": 0.00625,
"map_at_100": 0.01407,
"map_at_1000": 0.02364,
"map_at_20": 0.00852,
"map_at_3": 0.00426,
"map_at_5": 0.00505,
"mrr_at_1": 0.04953560371517028,
"mrr_at_10": 0.09766819008305073,
"mrr_at_100": 0.1143042883049741,
"mrr_at_1000": 0.11564444467525821,
"mrr_at_20": 0.1058668855370875,
"mrr_at_3": 0.08565531475748192,
"mrr_at_5": 0.09029927760577912,
"nauc_map_at_1000_diff1": -0.1958079925108266,
"nauc_map_at_1000_max": -0.06463533976536243,
"nauc_map_at_1000_std": -0.24470446814297303,
"nauc_map_at_100_diff1": -0.2357299122448158,
"nauc_map_at_100_max": -0.11181889175369304,
"nauc_map_at_100_std": -0.29771767768362534,
"nauc_map_at_10_diff1": -0.23516853918208808,
"nauc_map_at_10_max": -0.19049695699159686,
"nauc_map_at_10_std": -0.4162700966729437,
"nauc_map_at_1_diff1": -0.09100310178147415,
"nauc_map_at_1_max": -0.17939713861367018,
"nauc_map_at_1_std": -0.41250006429186176,
"nauc_map_at_20_diff1": -0.24291479999549453,
"nauc_map_at_20_max": -0.1730117500618571,
"nauc_map_at_20_std": -0.3680503640713014,
"nauc_map_at_3_diff1": -0.15604304366169247,
"nauc_map_at_3_max": -0.12550429529358884,
"nauc_map_at_3_std": -0.3892702435741882,
"nauc_map_at_5_diff1": -0.20167340407146864,
"nauc_map_at_5_max": -0.17255469267145876,
"nauc_map_at_5_std": -0.41322834429341687,
"nauc_mrr_at_1000_diff1": -0.07225034635252489,
"nauc_mrr_at_1000_max": 0.01997635570958497,
"nauc_mrr_at_1000_std": -0.2400346557824735,
"nauc_mrr_at_100_diff1": -0.07298416339539703,
"nauc_mrr_at_100_max": 0.020673185524783935,
"nauc_mrr_at_100_std": -0.23838933597992373,
"nauc_mrr_at_10_diff1": -0.07551113225342497,
"nauc_mrr_at_10_max": 0.0061219099428982985,
"nauc_mrr_at_10_std": -0.25940570291941717,
"nauc_mrr_at_1_diff1": -0.03574985182035973,
"nauc_mrr_at_1_max": 0.02909367868170859,
"nauc_mrr_at_1_std": -0.32766470054153524,
"nauc_mrr_at_20_diff1": -0.07831771358408321,
"nauc_mrr_at_20_max": 0.013272569195385508,
"nauc_mrr_at_20_std": -0.23966992947273746,
"nauc_mrr_at_3_diff1": -0.07148078020728597,
"nauc_mrr_at_3_max": 0.01811801271585865,
"nauc_mrr_at_3_std": -0.25720789122345705,
"nauc_mrr_at_5_diff1": -0.08774840338010435,
"nauc_mrr_at_5_max": -0.005518039130420398,
"nauc_mrr_at_5_std": -0.2719436077274807,
"nauc_ndcg_at_1000_diff1": -0.12177846047415185,
"nauc_ndcg_at_1000_max": 0.02013743874973639,
"nauc_ndcg_at_1000_std": -0.2179589333473427,
"nauc_ndcg_at_100_diff1": -0.1712173140186249,
"nauc_ndcg_at_100_max": -0.020565086291765827,
"nauc_ndcg_at_100_std": -0.198317067059126,
"nauc_ndcg_at_10_diff1": -0.1256917455726515,
"nauc_ndcg_at_10_max": -0.10210327765119764,
"nauc_ndcg_at_10_std": -0.26918389433510415,
"nauc_ndcg_at_1_diff1": -0.08268581831481192,
"nauc_ndcg_at_1_max": -0.017858051832516385,
"nauc_ndcg_at_1_std": -0.3035423719371352,
"nauc_ndcg_at_20_diff1": -0.16950748062044646,
"nauc_ndcg_at_20_max": -0.09681105315994616,
"nauc_ndcg_at_20_std": -0.23909989129433282,
"nauc_ndcg_at_3_diff1": -0.084696800689983,
"nauc_ndcg_at_3_max": -0.017406792152505048,
"nauc_ndcg_at_3_std": -0.20580169757068517,
"nauc_ndcg_at_5_diff1": -0.10943770257913095,
"nauc_ndcg_at_5_max": -0.0911106685756062,
"nauc_ndcg_at_5_std": -0.24602697201789522,
"nauc_precision_at_1000_diff1": 0.0377752837428462,
"nauc_precision_at_1000_max": 0.061811760531745946,
"nauc_precision_at_1000_std": 0.007975708651163994,
"nauc_precision_at_100_diff1": -0.12712903534556,
"nauc_precision_at_100_max": 0.03329413273717648,
"nauc_precision_at_100_std": -0.0874882718086588,
"nauc_precision_at_10_diff1": -0.12946170426715875,
"nauc_precision_at_10_max": -0.09720388119034555,
"nauc_precision_at_10_std": -0.2626680017611529,
"nauc_precision_at_1_diff1": -0.03574985182035973,
"nauc_precision_at_1_max": 0.02909367868170859,
"nauc_precision_at_1_std": -0.32766470054153524,
"nauc_precision_at_20_diff1": -0.16230338091388882,
"nauc_precision_at_20_max": -0.0805058959905799,
"nauc_precision_at_20_std": -0.19491601580972975,
"nauc_precision_at_3_diff1": -0.08286780483679686,
"nauc_precision_at_3_max": -0.008275279985569385,
"nauc_precision_at_3_std": -0.20970596893525778,
"nauc_precision_at_5_diff1": -0.12309699294684523,
"nauc_precision_at_5_max": -0.10941749534575063,
"nauc_precision_at_5_std": -0.2588548280752214,
"nauc_recall_at_1000_diff1": -0.11230248060676767,
"nauc_recall_at_1000_max": -0.07164886953796575,
"nauc_recall_at_1000_std": -0.1147887057500913,
"nauc_recall_at_100_diff1": -0.12340708811925778,
"nauc_recall_at_100_max": 0.05773229484602484,
"nauc_recall_at_100_std": -0.09526034964984659,
"nauc_recall_at_10_diff1": -0.24612588197726526,
"nauc_recall_at_10_max": -0.16245926748565806,
"nauc_recall_at_10_std": -0.3555615650141615,
"nauc_recall_at_1_diff1": -0.09100310178147415,
"nauc_recall_at_1_max": -0.17939713861367018,
"nauc_recall_at_1_std": -0.41250006429186176,
"nauc_recall_at_20_diff1": -0.2100927679735405,
"nauc_recall_at_20_max": -0.03340035997028838,
"nauc_recall_at_20_std": -0.15989580035000697,
"nauc_recall_at_3_diff1": -0.17262000451913828,
"nauc_recall_at_3_max": -0.09571578057885666,
"nauc_recall_at_3_std": -0.3763182314028359,
"nauc_recall_at_5_diff1": -0.24128223078910635,
"nauc_recall_at_5_max": -0.19112327005932409,
"nauc_recall_at_5_std": -0.42004656522954104,
"ndcg_at_1": 0.04334,
"ndcg_at_10": 0.03882,
"ndcg_at_100": 0.07522,
"ndcg_at_1000": 0.19278,
"ndcg_at_20": 0.04494,
"ndcg_at_3": 0.0496,
"ndcg_at_5": 0.04318,
"precision_at_1": 0.04954,
"precision_at_10": 0.03344,
"precision_at_100": 0.02929,
"precision_at_1000": 0.01652,
"precision_at_20": 0.03591,
"precision_at_3": 0.0547,
"precision_at_5": 0.04211,
"recall_at_1": 0.00256,
"recall_at_10": 0.01199,
"recall_at_100": 0.123,
"recall_at_1000": 0.53108,
"recall_at_20": 0.02986,
"recall_at_3": 0.00564,
"recall_at_5": 0.00749
}
]
},
"task_name": "NFCorpus"
}
I had initially thought it might be an issue with the mteb package, but I have seen the expected performance with other models, and the smaller model did not show anything unexpected. Here are the results for snowflake-arctic-embed-s:
{ "dataset_revision": "ec0fa4fe99da2ff19ca1214b7966684033a58814", "evaluation_time": 6.9981629848480225, "kg_co2_emissions": null, "mteb_version": "1.13.0", "scores": { "test": [ { "hf_subset": "default", "languages": [ "eng-Latn" ], "main_score": 0.26669, "map_at_1": 0.04302, "map_at_10": 0.09495, "map_at_100": 0.11745, "map_at_1000": 0.12979, "map_at_20": 0.10425, "map_at_3": 0.07058, "map_at_5": 0.08291, "mrr_at_1": 0.3560371517027864, "mrr_at_10": 0.4341724409061872, "mrr_at_100": 0.442795364949832, "mrr_at_1000": 0.44339722303685203, "mrr_at_20": 0.4398766686404833, "mrr_at_3": 0.4097007223942209, "mrr_at_5": 0.4234778121775027, "nauc_map_at_1000_diff1": 0.2908534146180977, "nauc_map_at_1000_max": 0.0794375059635545, "nauc_map_at_1000_std": 0.14341115138794566, "nauc_map_at_100_diff1": 0.3011875220395871, "nauc_map_at_100_max": 0.07297178629654529, "nauc_map_at_100_std": 0.09064207472188912, "nauc_map_at_10_diff1": 0.32693215757923644, "nauc_map_at_10_max": 0.028457578311956038, "nauc_map_at_10_std": -0.011325318604211824, "nauc_map_at_1_diff1": 0.43510716229194757, "nauc_map_at_1_max": -0.09495833501538574, "nauc_map_at_1_std": -0.14781701100548014, "nauc_map_at_20_diff1": 0.31124269101574803, "nauc_map_at_20_max": 0.04954931654356847, "nauc_map_at_20_std": 0.030895354009790026, "nauc_map_at_3_diff1": 0.3604769289209484, "nauc_map_at_3_max": -0.027593643375550314, "nauc_map_at_3_std": -0.10192071038688266, "nauc_map_at_5_diff1": 0.3442867338018007, "nauc_map_at_5_max": -0.007973881046767651, "nauc_map_at_5_std": -0.07650131214272698, "nauc_mrr_at_1000_diff1": 0.3441296669476509, "nauc_mrr_at_1000_max": 0.1767284629322694, "nauc_mrr_at_1000_std": 0.22358432616149948, "nauc_mrr_at_100_diff1": 0.3439411376546323, "nauc_mrr_at_100_max": 0.17696296717939752, "nauc_mrr_at_100_std": 0.2240207609439034, "nauc_mrr_at_10_diff1": 0.3458664215847159, "nauc_mrr_at_10_max": 0.17667414171239298, "nauc_mrr_at_10_std": 0.22004223623552005, "nauc_mrr_at_1_diff1": 0.3641675562685909, "nauc_mrr_at_1_max": 0.1526656279467144, "nauc_mrr_at_1_std": 0.188276001581598, "nauc_mrr_at_20_diff1": 0.3441818023527764, "nauc_mrr_at_20_max": 0.17670207608274915, "nauc_mrr_at_20_std": 0.22417080504121126, "nauc_mrr_at_3_diff1": 0.33399349531541783, "nauc_mrr_at_3_max": 0.1432767101148197, "nauc_mrr_at_3_std": 0.196059412307152, "nauc_mrr_at_5_diff1": 0.3446366507012419, "nauc_mrr_at_5_max": 0.16334209662497975, "nauc_mrr_at_5_std": 0.203338879990243, "nauc_ndcg_at_1000_diff1": 0.29810550938241237, "nauc_ndcg_at_1000_max": 0.22622966209430026, "nauc_ndcg_at_1000_std": 0.30056225696964384, "nauc_ndcg_at_100_diff1": 0.28834449913390753, "nauc_ndcg_at_100_max": 0.172117327948967, "nauc_ndcg_at_100_std": 0.2597186204310389, "nauc_ndcg_at_10_diff1": 0.2413569781287051, "nauc_ndcg_at_10_max": 0.1502855578135408, "nauc_ndcg_at_10_std": 0.2878538091719135, "nauc_ndcg_at_1_diff1": 0.35948388778268686, "nauc_ndcg_at_1_max": 0.13202607066740898, "nauc_ndcg_at_1_std": 0.1779720342529562, "nauc_ndcg_at_20_diff1": 0.2347932992404398, "nauc_ndcg_at_20_max": 0.14300959447198772, "nauc_ndcg_at_20_std": 0.29657349855437265, "nauc_ndcg_at_3_diff1": 0.2626965790710133, "nauc_ndcg_at_3_max": 0.13594596222426555, "nauc_ndcg_at_3_std": 0.20813401730033873, "nauc_ndcg_at_5_diff1": 0.25278782877740325, "nauc_ndcg_at_5_max": 0.13474911774061052, "nauc_ndcg_at_5_std": 0.2350442820314744, "nauc_precision_at_1000_diff1": -0.08602352848674248, "nauc_precision_at_1000_max": 0.024870204006624468, "nauc_precision_at_1000_std": 0.5635045022163561, 
"nauc_precision_at_100_diff1": -0.03670090334213061, "nauc_precision_at_100_max": 0.12263803560765828, "nauc_precision_at_100_std": 0.6191278682973843, "nauc_precision_at_10_diff1": 0.09802328391260658, "nauc_precision_at_10_max": 0.20638275438526946, "nauc_precision_at_10_std": 0.4479278301765283, "nauc_precision_at_1_diff1": 0.3641675562685909, "nauc_precision_at_1_max": 0.1526656279467144, "nauc_precision_at_1_std": 0.188276001581598, "nauc_precision_at_20_diff1": 0.03046256960974856, "nauc_precision_at_20_max": 0.19133433096277322, "nauc_precision_at_20_std": 0.5306366171192535, "nauc_precision_at_3_diff1": 0.20209780757568416, "nauc_precision_at_3_max": 0.18154657805882843, "nauc_precision_at_3_std": 0.2590071101397448, "nauc_precision_at_5_diff1": 0.15568166474094394, "nauc_precision_at_5_max": 0.17807082498869553, "nauc_precision_at_5_std": 0.32559535072820006, "nauc_recall_at_1000_diff1": 0.15997389277635088, "nauc_recall_at_1000_max": 0.10204373295974262, "nauc_recall_at_1000_std": 0.13314977700491887, "nauc_recall_at_100_diff1": 0.2291028694054851, "nauc_recall_at_100_max": 0.13583290131936082, "nauc_recall_at_100_std": 0.0801423337284215, "nauc_recall_at_10_diff1": 0.3141397411384742, "nauc_recall_at_10_max": 0.0767846369013392, "nauc_recall_at_10_std": -0.006720561776378537, "nauc_recall_at_1_diff1": 0.43510716229194757, "nauc_recall_at_1_max": -0.09495833501538574, "nauc_recall_at_1_std": -0.14781701100548014, "nauc_recall_at_20_diff1": 0.27846485011566113, "nauc_recall_at_20_max": 0.09949530960785688, "nauc_recall_at_20_std": 0.03823685906686495, "nauc_recall_at_3_diff1": 0.3295437078085661, "nauc_recall_at_3_max": -0.0042534766531193945, "nauc_recall_at_3_std": -0.0999311807115119, "nauc_recall_at_5_diff1": 0.3202247786527721, "nauc_recall_at_5_max": 0.036029596413288824, "nauc_recall_at_5_std": -0.07887198762206853, "ndcg_at_1": 0.34211, "ndcg_at_10": 0.26669, "ndcg_at_100": 0.24748, "ndcg_at_1000": 0.3381, "ndcg_at_20": 0.24827, "ndcg_at_3": 0.31097, "ndcg_at_5": 0.29408, "precision_at_1": 0.35604, "precision_at_10": 0.19876, "precision_at_100": 0.06446, "precision_at_1000": 0.01949, "precision_at_20": 0.14861, "precision_at_3": 0.29102, "precision_at_5": 0.25573, "recall_at_1": 0.04302, "recall_at_10": 0.12511, "recall_at_100": 0.25124, "recall_at_1000": 0.57722, "recall_at_20": 0.15283, "recall_at_3": 0.08105, "recall_at_5": 0.10272 } ] }, "task_name": "NFCorpus" }Unfortunately I don't see what's causing your issue from your code, largely due to unfamiliarity with the packages you're using (sentence transformers and the MTEB package).
From your example code, it is not explicit that you're applying the query instruction prefix when embedding queries, so that's one thing to double-check, but I would be surprised if that's the cause of the problem, because the M and S models should share the same sentence transformers configuration.
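To illustrate the prefix I mean, here is a minimal sketch with sentence transformers (the prefix string is the one documented for these models; the example texts are illustrative, so adjust for your setup):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m")

# Only queries get the instruction prefix; documents are embedded as-is.
query_prefix = "Represent this sentence for searching relevant passages: "
queries = ["how is iron deficiency anemia treated?"]
documents = ["Iron deficiency anemia is usually treated with oral iron supplements."]

query_embeddings = model.encode([query_prefix + q for q in queries], normalize_embeddings=True)
document_embeddings = model.encode(documents, normalize_embeddings=True)

# Dot product of normalized vectors == cosine similarity
print(query_embeddings @ document_embeddings.T)
```

Newer sentence transformers versions may also let you pass a configured prompt (e.g. `model.encode(queries, prompt_name="query")`) if the model repository defines one, but I have not verified that for your version.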
If you are primarily interested in running MTEB evaluations, we may be able to include in our README a script for running these evaluations the way we run them. Unfortunately, recent updates to the MTEB package have broken our original script, so it may take some time to update and fix it.
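In the meantime, something along these lines is roughly what an NFCorpus run with the mteb package looks like (illustrative only, not our exact script; note that it does not apply the query prefix on its own, and function names may shift between mteb versions):

```python
import mteb
from sentence_transformers import SentenceTransformer

# Illustrative only: a plain SentenceTransformer here will NOT add the
# query instruction prefix that Arctic embed models expect for queries.
model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m")

tasks = mteb.get_tasks(tasks=["NFCorpus"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results/snowflake-arctic-embed-m")
print(results)
```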
Oh, it's probably also worth highlighting that the S model actually scores around 0.32 (not 0.27) on NFCorpus when run correctly, so it seems there is generally something off with the way you are running the models.
@nsheikh Did you figure out your issue yet?