|
--- |
|
tags: |
|
- mteb |
|
model-index: |
|
- name: Dmeta-embedding-zh-small |
|
results: |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/AFQMC |
|
name: MTEB AFQMC |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 55.38441014851534 |
|
- type: cos_sim_spearman |
|
value: 59.54284362578262 |
|
- type: euclidean_pearson |
|
value: 58.18592108890414 |
|
- type: euclidean_spearman |
|
value: 59.54284362133902 |
|
- type: manhattan_pearson |
|
value: 58.142197046175916 |
|
- type: manhattan_spearman |
|
value: 59.47943468645265 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/ATEC |
|
name: MTEB ATEC |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 55.96911621560259 |
|
- type: cos_sim_spearman |
|
value: 58.6334496101353 |
|
- type: euclidean_pearson |
|
value: 62.78426382809823 |
|
- type: euclidean_spearman |
|
value: 58.63344961011331 |
|
- type: manhattan_pearson |
|
value: 62.80625401678188 |
|
- type: manhattan_spearman |
|
value: 58.618722128260394 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: mteb/amazon_reviews_multi |
|
name: MTEB AmazonReviewsClassification (zh) |
|
config: zh |
|
split: test |
|
revision: 1399c76144fd37290681b995c656ef9b2e06e26d |
|
metrics: |
|
- type: accuracy |
|
value: 44.88 |
|
- type: f1 |
|
value: 42.739249460584375 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/BQ |
|
name: MTEB BQ |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 68.56815521242152 |
|
- type: cos_sim_spearman |
|
value: 70.30776353631751 |
|
- type: euclidean_pearson |
|
value: 69.10087719019191 |
|
- type: euclidean_spearman |
|
value: 70.30775660748148 |
|
- type: manhattan_pearson |
|
value: 69.0672710967445 |
|
- type: manhattan_spearman |
|
value: 70.31940638148254 |
|
- task: |
|
type: Clustering |
|
dataset: |
|
type: C-MTEB/CLSClusteringP2P |
|
name: MTEB CLSClusteringP2P |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: v_measure |
|
value: 40.7861976704356 |
|
- task: |
|
type: Clustering |
|
dataset: |
|
type: C-MTEB/CLSClusteringS2S |
|
name: MTEB CLSClusteringS2S |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: v_measure |
|
value: 38.43028280281822 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/CMedQAv1-reranking |
|
name: MTEB CMedQAv1 |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 86.78386695617407 |
|
- type: mrr |
|
value: 88.79857142857142 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/CMedQAv2-reranking |
|
name: MTEB CMedQAv2 |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 87.38582377194436 |
|
- type: mrr |
|
value: 89.17158730158731 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/CmedqaRetrieval |
|
name: MTEB CmedqaRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 23.746000000000002 |
|
- type: map_at_10 |
|
value: 35.952 |
|
- type: map_at_100 |
|
value: 37.946999999999996 |
|
- type: map_at_1000 |
|
value: 38.059 |
|
- type: map_at_3 |
|
value: 31.680999999999997 |
|
- type: map_at_5 |
|
value: 34.046 |
|
- type: mrr_at_1 |
|
value: 36.409000000000006 |
|
- type: mrr_at_10 |
|
value: 44.801 |
|
- type: mrr_at_100 |
|
value: 45.842 |
|
- type: mrr_at_1000 |
|
value: 45.885999999999996 |
|
- type: mrr_at_3 |
|
value: 42.081 |
|
- type: mrr_at_5 |
|
value: 43.613 |
|
- type: ndcg_at_1 |
|
value: 36.409000000000006 |
|
- type: ndcg_at_10 |
|
value: 42.687000000000005 |
|
- type: ndcg_at_100 |
|
value: 50.352 |
|
- type: ndcg_at_1000 |
|
value: 52.275000000000006 |
|
- type: ndcg_at_3 |
|
value: 37.113 |
|
- type: ndcg_at_5 |
|
value: 39.434000000000005 |
|
- type: precision_at_1 |
|
value: 36.409000000000006 |
|
- type: precision_at_10 |
|
value: 9.712 |
|
- type: precision_at_100 |
|
value: 1.584 |
|
- type: precision_at_1000 |
|
value: 0.182 |
|
- type: precision_at_3 |
|
value: 21.096999999999998 |
|
- type: precision_at_5 |
|
value: 15.498999999999999 |
|
- type: recall_at_1 |
|
value: 23.746000000000002 |
|
- type: recall_at_10 |
|
value: 53.596 |
|
- type: recall_at_100 |
|
value: 85.232 |
|
- type: recall_at_1000 |
|
value: 98.092 |
|
- type: recall_at_3 |
|
value: 37.226 |
|
- type: recall_at_5 |
|
value: 44.187 |
|
- task: |
|
type: PairClassification |
|
dataset: |
|
type: C-MTEB/CMNLI |
|
name: MTEB Cmnli |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: cos_sim_accuracy |
|
value: 82.66987372218881 |
|
- type: cos_sim_ap |
|
value: 90.28715189799232 |
|
- type: cos_sim_f1 |
|
value: 84.108318049412 |
|
- type: cos_sim_precision |
|
value: 78.0849358974359 |
|
- type: cos_sim_recall |
|
value: 91.13864858545709 |
|
- type: dot_accuracy |
|
value: 82.66987372218881 |
|
- type: dot_ap |
|
value: 90.29346021403634 |
|
- type: dot_f1 |
|
value: 84.108318049412 |
|
- type: dot_precision |
|
value: 78.0849358974359 |
|
- type: dot_recall |
|
value: 91.13864858545709 |
|
- type: euclidean_accuracy |
|
value: 82.66987372218881 |
|
- type: euclidean_ap |
|
value: 90.28656734732074 |
|
- type: euclidean_f1 |
|
value: 84.108318049412 |
|
- type: euclidean_precision |
|
value: 78.0849358974359 |
|
- type: euclidean_recall |
|
value: 91.13864858545709 |
|
- type: manhattan_accuracy |
|
value: 82.70595309681299 |
|
- type: manhattan_ap |
|
value: 90.25413574022456 |
|
- type: manhattan_f1 |
|
value: 83.9924670433145 |
|
- type: manhattan_precision |
|
value: 79.81052631578947 |
|
- type: manhattan_recall |
|
value: 88.63689501987373 |
|
- type: max_accuracy |
|
value: 82.70595309681299 |
|
- type: max_ap |
|
value: 90.29346021403634 |
|
- type: max_f1 |
|
value: 84.108318049412 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/CovidRetrieval |
|
name: MTEB CovidRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 61.38 |
|
- type: map_at_10 |
|
value: 71.23 |
|
- type: map_at_100 |
|
value: 71.61800000000001 |
|
- type: map_at_1000 |
|
value: 71.63000000000001 |
|
- type: map_at_3 |
|
value: 69.31 |
|
- type: map_at_5 |
|
value: 70.403 |
|
- type: mrr_at_1 |
|
value: 61.538000000000004 |
|
- type: mrr_at_10 |
|
value: 71.28999999999999 |
|
- type: mrr_at_100 |
|
value: 71.666 |
|
- type: mrr_at_1000 |
|
value: 71.678 |
|
- type: mrr_at_3 |
|
value: 69.44200000000001 |
|
- type: mrr_at_5 |
|
value: 70.506 |
|
- type: ndcg_at_1 |
|
value: 61.538000000000004 |
|
- type: ndcg_at_10 |
|
value: 75.626 |
|
- type: ndcg_at_100 |
|
value: 77.449 |
|
- type: ndcg_at_1000 |
|
value: 77.73400000000001 |
|
- type: ndcg_at_3 |
|
value: 71.75200000000001 |
|
- type: ndcg_at_5 |
|
value: 73.695 |
|
- type: precision_at_1 |
|
value: 61.538000000000004 |
|
- type: precision_at_10 |
|
value: 9.009 |
|
- type: precision_at_100 |
|
value: 0.9860000000000001 |
|
- type: precision_at_1000 |
|
value: 0.101 |
|
- type: precision_at_3 |
|
value: 26.379 |
|
- type: precision_at_5 |
|
value: 16.797 |
|
- type: recall_at_1 |
|
value: 61.38 |
|
- type: recall_at_10 |
|
value: 89.199 |
|
- type: recall_at_100 |
|
value: 97.576 |
|
- type: recall_at_1000 |
|
value: 99.789 |
|
- type: recall_at_3 |
|
value: 78.635 |
|
- type: recall_at_5 |
|
value: 83.325 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/DuRetrieval |
|
name: MTEB DuRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 23.067 |
|
- type: map_at_10 |
|
value: 70.658 |
|
- type: map_at_100 |
|
value: 73.85300000000001 |
|
- type: map_at_1000 |
|
value: 73.925 |
|
- type: map_at_3 |
|
value: 48.391 |
|
- type: map_at_5 |
|
value: 61.172000000000004 |
|
- type: mrr_at_1 |
|
value: 83.1 |
|
- type: mrr_at_10 |
|
value: 88.214 |
|
- type: mrr_at_100 |
|
value: 88.298 |
|
- type: mrr_at_1000 |
|
value: 88.304 |
|
- type: mrr_at_3 |
|
value: 87.717 |
|
- type: mrr_at_5 |
|
value: 88.03699999999999 |
|
- type: ndcg_at_1 |
|
value: 83.1 |
|
- type: ndcg_at_10 |
|
value: 79.89 |
|
- type: ndcg_at_100 |
|
value: 83.829 |
|
- type: ndcg_at_1000 |
|
value: 84.577 |
|
- type: ndcg_at_3 |
|
value: 78.337 |
|
- type: ndcg_at_5 |
|
value: 77.224 |
|
- type: precision_at_1 |
|
value: 83.1 |
|
- type: precision_at_10 |
|
value: 38.934999999999995 |
|
- type: precision_at_100 |
|
value: 4.6690000000000005 |
|
- type: precision_at_1000 |
|
value: 0.484 |
|
- type: precision_at_3 |
|
value: 70.48299999999999 |
|
- type: precision_at_5 |
|
value: 59.68 |
|
- type: recall_at_1 |
|
value: 23.067 |
|
- type: recall_at_10 |
|
value: 81.702 |
|
- type: recall_at_100 |
|
value: 94.214 |
|
- type: recall_at_1000 |
|
value: 98.241 |
|
- type: recall_at_3 |
|
value: 51.538 |
|
- type: recall_at_5 |
|
value: 67.39 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/EcomRetrieval |
|
name: MTEB EcomRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 49.8 |
|
- type: map_at_10 |
|
value: 59.46399999999999 |
|
- type: map_at_100 |
|
value: 60.063 |
|
- type: map_at_1000 |
|
value: 60.08 |
|
- type: map_at_3 |
|
value: 56.833 |
|
- type: map_at_5 |
|
value: 58.438 |
|
- type: mrr_at_1 |
|
value: 49.8 |
|
- type: mrr_at_10 |
|
value: 59.46399999999999 |
|
- type: mrr_at_100 |
|
value: 60.063 |
|
- type: mrr_at_1000 |
|
value: 60.08 |
|
- type: mrr_at_3 |
|
value: 56.833 |
|
- type: mrr_at_5 |
|
value: 58.438 |
|
- type: ndcg_at_1 |
|
value: 49.8 |
|
- type: ndcg_at_10 |
|
value: 64.48 |
|
- type: ndcg_at_100 |
|
value: 67.314 |
|
- type: ndcg_at_1000 |
|
value: 67.745 |
|
- type: ndcg_at_3 |
|
value: 59.06400000000001 |
|
- type: ndcg_at_5 |
|
value: 61.973 |
|
- type: precision_at_1 |
|
value: 49.8 |
|
- type: precision_at_10 |
|
value: 8.04 |
|
- type: precision_at_100 |
|
value: 0.935 |
|
- type: precision_at_1000 |
|
value: 0.097 |
|
- type: precision_at_3 |
|
value: 21.833 |
|
- type: precision_at_5 |
|
value: 14.52 |
|
- type: recall_at_1 |
|
value: 49.8 |
|
- type: recall_at_10 |
|
value: 80.4 |
|
- type: recall_at_100 |
|
value: 93.5 |
|
- type: recall_at_1000 |
|
value: 96.8 |
|
- type: recall_at_3 |
|
value: 65.5 |
|
- type: recall_at_5 |
|
value: 72.6 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/IFlyTek-classification |
|
name: MTEB IFlyTek |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 49.111196614082345 |
|
- type: f1 |
|
value: 37.07930546974089 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/JDReview-classification |
|
name: MTEB JDReview |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 85.57223264540339 |
|
- type: ap |
|
value: 53.30690968994808 |
|
- type: f1 |
|
value: 80.20587062271773 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/LCQMC |
|
name: MTEB LCQMC |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 73.03085269274996 |
|
- type: cos_sim_spearman |
|
value: 78.72837937949888 |
|
- type: euclidean_pearson |
|
value: 78.34911745798928 |
|
- type: euclidean_spearman |
|
value: 78.72838602779268 |
|
- type: manhattan_pearson |
|
value: 78.31833697617105 |
|
- type: manhattan_spearman |
|
value: 78.69603741566397 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/Mmarco-reranking |
|
name: MTEB MMarcoReranking |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 27.391692468538416 |
|
- type: mrr |
|
value: 26.44682539682539 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/MMarcoRetrieval |
|
name: MTEB MMarcoRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 57.206999999999994 |
|
- type: map_at_10 |
|
value: 66.622 |
|
- type: map_at_100 |
|
value: 67.12700000000001 |
|
- type: map_at_1000 |
|
value: 67.145 |
|
- type: map_at_3 |
|
value: 64.587 |
|
- type: map_at_5 |
|
value: 65.827 |
|
- type: mrr_at_1 |
|
value: 59.312 |
|
- type: mrr_at_10 |
|
value: 67.387 |
|
- type: mrr_at_100 |
|
value: 67.836 |
|
- type: mrr_at_1000 |
|
value: 67.851 |
|
- type: mrr_at_3 |
|
value: 65.556 |
|
- type: mrr_at_5 |
|
value: 66.66 |
|
- type: ndcg_at_1 |
|
value: 59.312 |
|
- type: ndcg_at_10 |
|
value: 70.748 |
|
- type: ndcg_at_100 |
|
value: 73.076 |
|
- type: ndcg_at_1000 |
|
value: 73.559 |
|
- type: ndcg_at_3 |
|
value: 66.81200000000001 |
|
- type: ndcg_at_5 |
|
value: 68.92399999999999 |
|
- type: precision_at_1 |
|
value: 59.312 |
|
- type: precision_at_10 |
|
value: 8.798 |
|
- type: precision_at_100 |
|
value: 0.996 |
|
- type: precision_at_1000 |
|
value: 0.104 |
|
- type: precision_at_3 |
|
value: 25.487 |
|
- type: precision_at_5 |
|
value: 16.401 |
|
- type: recall_at_1 |
|
value: 57.206999999999994 |
|
- type: recall_at_10 |
|
value: 82.767 |
|
- type: recall_at_100 |
|
value: 93.449 |
|
- type: recall_at_1000 |
|
value: 97.262 |
|
- type: recall_at_3 |
|
value: 72.271 |
|
- type: recall_at_5 |
|
value: 77.291 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: mteb/amazon_massive_intent |
|
name: MTEB MassiveIntentClassification (zh-CN) |
|
config: zh-CN |
|
split: test |
|
revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 |
|
metrics: |
|
- type: accuracy |
|
value: 70.78345662407531 |
|
- type: f1 |
|
value: 68.35683436974351 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: mteb/amazon_massive_scenario |
|
name: MTEB MassiveScenarioClassification (zh-CN) |
|
config: zh-CN |
|
split: test |
|
revision: 7d571f92784cd94a019292a1f45445077d0ef634 |
|
metrics: |
|
- type: accuracy |
|
value: 73.16408876933423 |
|
- type: f1 |
|
value: 73.31484873459382 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/MedicalRetrieval |
|
name: MTEB MedicalRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 51.4 |
|
- type: map_at_10 |
|
value: 57.091 |
|
- type: map_at_100 |
|
value: 57.652 |
|
- type: map_at_1000 |
|
value: 57.703 |
|
- type: map_at_3 |
|
value: 55.733 |
|
- type: map_at_5 |
|
value: 56.363 |
|
- type: mrr_at_1 |
|
value: 51.7 |
|
- type: mrr_at_10 |
|
value: 57.243 |
|
- type: mrr_at_100 |
|
value: 57.80499999999999 |
|
- type: mrr_at_1000 |
|
value: 57.855999999999995 |
|
- type: mrr_at_3 |
|
value: 55.883 |
|
- type: mrr_at_5 |
|
value: 56.513000000000005 |
|
- type: ndcg_at_1 |
|
value: 51.4 |
|
- type: ndcg_at_10 |
|
value: 59.948 |
|
- type: ndcg_at_100 |
|
value: 63.064 |
|
- type: ndcg_at_1000 |
|
value: 64.523 |
|
- type: ndcg_at_3 |
|
value: 57.089999999999996 |
|
- type: ndcg_at_5 |
|
value: 58.214 |
|
- type: precision_at_1 |
|
value: 51.4 |
|
- type: precision_at_10 |
|
value: 6.9 |
|
- type: precision_at_100 |
|
value: 0.845 |
|
- type: precision_at_1000 |
|
value: 0.096 |
|
- type: precision_at_3 |
|
value: 20.333000000000002 |
|
- type: precision_at_5 |
|
value: 12.740000000000002 |
|
- type: recall_at_1 |
|
value: 51.4 |
|
- type: recall_at_10 |
|
value: 69.0 |
|
- type: recall_at_100 |
|
value: 84.5 |
|
- type: recall_at_1000 |
|
value: 96.2 |
|
- type: recall_at_3 |
|
value: 61.0 |
|
- type: recall_at_5 |
|
value: 63.7 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/MultilingualSentiment-classification |
|
name: MTEB MultilingualSentiment |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 74.38999999999999 |
|
- type: f1 |
|
value: 74.07161306140839 |
|
- task: |
|
type: PairClassification |
|
dataset: |
|
type: C-MTEB/OCNLI |
|
name: MTEB Ocnli |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: cos_sim_accuracy |
|
value: 81.15863562533838 |
|
- type: cos_sim_ap |
|
value: 84.84571607908443 |
|
- type: cos_sim_f1 |
|
value: 82.55872063968016 |
|
- type: cos_sim_precision |
|
value: 78.36812144212524 |
|
- type: cos_sim_recall |
|
value: 87.22280887011615 |
|
- type: dot_accuracy |
|
value: 81.15863562533838 |
|
- type: dot_ap |
|
value: 84.84571607908443 |
|
- type: dot_f1 |
|
value: 82.55872063968016 |
|
- type: dot_precision |
|
value: 78.36812144212524 |
|
- type: dot_recall |
|
value: 87.22280887011615 |
|
- type: euclidean_accuracy |
|
value: 81.15863562533838 |
|
- type: euclidean_ap |
|
value: 84.84571607908443 |
|
- type: euclidean_f1 |
|
value: 82.55872063968016 |
|
- type: euclidean_precision |
|
value: 78.36812144212524 |
|
- type: euclidean_recall |
|
value: 87.22280887011615 |
|
- type: manhattan_accuracy |
|
value: 80.7796426637791 |
|
- type: manhattan_ap |
|
value: 84.81524098914134 |
|
- type: manhattan_f1 |
|
value: 82.36462990561351 |
|
- type: manhattan_precision |
|
value: 77.76735459662288 |
|
- type: manhattan_recall |
|
value: 87.53959873284055 |
|
- type: max_accuracy |
|
value: 81.15863562533838 |
|
- type: max_ap |
|
value: 84.84571607908443 |
|
- type: max_f1 |
|
value: 82.55872063968016 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/OnlineShopping-classification |
|
name: MTEB OnlineShopping |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 93.12000000000002 |
|
- type: ap |
|
value: 91.0749103088623 |
|
- type: f1 |
|
value: 93.10837266607813 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/PAWSX |
|
name: MTEB PAWSX |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 38.5692290188029 |
|
- type: cos_sim_spearman |
|
value: 42.965264868554335 |
|
- type: euclidean_pearson |
|
value: 43.002526263615735 |
|
- type: euclidean_spearman |
|
value: 42.97561576045246 |
|
- type: manhattan_pearson |
|
value: 43.050089639788936 |
|
- type: manhattan_spearman |
|
value: 43.038497558804934 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/QBQTC |
|
name: MTEB QBQTC |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 38.99284895602663 |
|
- type: cos_sim_spearman |
|
value: 41.02655813481606 |
|
- type: euclidean_pearson |
|
value: 38.934953519378354 |
|
- type: euclidean_spearman |
|
value: 41.02680077136343 |
|
- type: manhattan_pearson |
|
value: 39.224809609807785 |
|
- type: manhattan_spearman |
|
value: 41.13950779185706 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: mteb/sts22-crosslingual-sts |
|
name: MTEB STS22 (zh) |
|
config: zh |
|
split: test |
|
revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80 |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 66.47464607633356 |
|
- type: cos_sim_spearman |
|
value: 66.76311382148693 |
|
- type: euclidean_pearson |
|
value: 67.25180409604143 |
|
- type: euclidean_spearman |
|
value: 66.76311382148693 |
|
- type: manhattan_pearson |
|
value: 67.6928257682864 |
|
- type: manhattan_spearman |
|
value: 67.08172581019826 |
|
- task: |
|
type: STS |
|
dataset: |
|
type: C-MTEB/STSB |
|
name: MTEB STSB |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: cos_sim_pearson |
|
value: 77.48943840585562 |
|
- type: cos_sim_spearman |
|
value: 79.0869194735025 |
|
- type: euclidean_pearson |
|
value: 79.48559575794792 |
|
- type: euclidean_spearman |
|
value: 79.08765044225807 |
|
- type: manhattan_pearson |
|
value: 79.36157224751007 |
|
- type: manhattan_spearman |
|
value: 78.94400905463999 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/T2Reranking |
|
name: MTEB T2Reranking |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 66.1093201711458 |
|
- type: mrr |
|
value: 75.70959742506797 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/T2Retrieval |
|
name: MTEB T2Retrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 25.533 |
|
- type: map_at_10 |
|
value: 71.322 |
|
- type: map_at_100 |
|
value: 75.244 |
|
- type: map_at_1000 |
|
value: 75.333 |
|
- type: map_at_3 |
|
value: 50.15500000000001 |
|
- type: map_at_5 |
|
value: 61.514 |
|
- type: mrr_at_1 |
|
value: 86.126 |
|
- type: mrr_at_10 |
|
value: 89.462 |
|
- type: mrr_at_100 |
|
value: 89.58500000000001 |
|
- type: mrr_at_1000 |
|
value: 89.59 |
|
- type: mrr_at_3 |
|
value: 88.88000000000001 |
|
- type: mrr_at_5 |
|
value: 89.241 |
|
- type: ndcg_at_1 |
|
value: 86.126 |
|
- type: ndcg_at_10 |
|
value: 79.89500000000001 |
|
- type: ndcg_at_100 |
|
value: 84.405 |
|
- type: ndcg_at_1000 |
|
value: 85.286 |
|
- type: ndcg_at_3 |
|
value: 81.547 |
|
- type: ndcg_at_5 |
|
value: 79.834 |
|
- type: precision_at_1 |
|
value: 86.126 |
|
- type: precision_at_10 |
|
value: 39.972 |
|
- type: precision_at_100 |
|
value: 4.932 |
|
- type: precision_at_1000 |
|
value: 0.514 |
|
- type: precision_at_3 |
|
value: 71.49 |
|
- type: precision_at_5 |
|
value: 59.687 |
|
- type: recall_at_1 |
|
value: 25.533 |
|
- type: recall_at_10 |
|
value: 78.962 |
|
- type: recall_at_100 |
|
value: 93.413 |
|
- type: recall_at_1000 |
|
value: 97.89099999999999 |
|
- type: recall_at_3 |
|
value: 52.129000000000005 |
|
- type: recall_at_5 |
|
value: 65.444 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/TNews-classification |
|
name: MTEB TNews |
|
config: default |
|
split: validation |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 51.800000000000004 |
|
- type: f1 |
|
value: 50.07807183704828 |
|
- task: |
|
type: Clustering |
|
dataset: |
|
type: C-MTEB/ThuNewsClusteringP2P |
|
name: MTEB ThuNewsClusteringP2P |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: v_measure |
|
value: 65.15253218390774 |
|
- task: |
|
type: Clustering |
|
dataset: |
|
type: C-MTEB/ThuNewsClusteringS2S |
|
name: MTEB ThuNewsClusteringS2S |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: v_measure |
|
value: 58.81779372506517 |
|
- task: |
|
type: Retrieval |
|
dataset: |
|
type: C-MTEB/VideoRetrieval |
|
name: MTEB VideoRetrieval |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map_at_1 |
|
value: 53.0 |
|
- type: map_at_10 |
|
value: 63.422999999999995 |
|
- type: map_at_100 |
|
value: 63.995000000000005 |
|
- type: map_at_1000 |
|
value: 64.004 |
|
- type: map_at_3 |
|
value: 61.382999999999996 |
|
- type: map_at_5 |
|
value: 62.488 |
|
- type: mrr_at_1 |
|
value: 53.0 |
|
- type: mrr_at_10 |
|
value: 63.422999999999995 |
|
- type: mrr_at_100 |
|
value: 63.995000000000005 |
|
- type: mrr_at_1000 |
|
value: 64.004 |
|
- type: mrr_at_3 |
|
value: 61.382999999999996 |
|
- type: mrr_at_5 |
|
value: 62.488 |
|
- type: ndcg_at_1 |
|
value: 53.0 |
|
- type: ndcg_at_10 |
|
value: 68.301 |
|
- type: ndcg_at_100 |
|
value: 70.988 |
|
- type: ndcg_at_1000 |
|
value: 71.294 |
|
- type: ndcg_at_3 |
|
value: 64.11 |
|
- type: ndcg_at_5 |
|
value: 66.094 |
|
- type: precision_at_1 |
|
value: 53.0 |
|
- type: precision_at_10 |
|
value: 8.35 |
|
- type: precision_at_100 |
|
value: 0.958 |
|
- type: precision_at_1000 |
|
value: 0.098 |
|
- type: precision_at_3 |
|
value: 24.0 |
|
- type: precision_at_5 |
|
value: 15.36 |
|
- type: recall_at_1 |
|
value: 53.0 |
|
- type: recall_at_10 |
|
value: 83.5 |
|
- type: recall_at_100 |
|
value: 95.8 |
|
- type: recall_at_1000 |
|
value: 98.3 |
|
- type: recall_at_3 |
|
value: 72.0 |
|
- type: recall_at_5 |
|
value: 76.8 |
|
- task: |
|
type: Classification |
|
dataset: |
|
type: C-MTEB/waimai-classification |
|
name: MTEB Waimai |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: accuracy |
|
value: 86.18 |
|
- type: ap |
|
value: 69.04229346593745 |
|
- type: f1 |
|
value: 84.52986739717021 |
|
license: apache-2.0 |
|
--- |
|
|
|
<div align="center"> |
|
<img src="logo.png" alt="icon" width="100px"/> |
|
</div> |
|
|
|
<h1 align="center">Dmeta-embedding-zh-small</h1> |
|
|
|
- Dmeta-embedding系列模型是跨领域、跨任务、开箱即用的中文 Embedding 模型,适用于搜索、问答、智能客服、LLM+RAG 等各种业务场景,支持使用 Transformers/Sentence-Transformers/Langchain 等工具加载推理。 |
|
- **Dmeta-embedding-zh-small**是开源模型[Dmeta-embedding-zh](https://huggingface.co/DMetaSoul/Dmeta-embedding-zh)的蒸馏版本(8层BERT),模型大小不到300M。相较于原始版本,Dmeta-embedding-zh-small模型大小减小三分之一,推理速度提升约30%,总体精度下降约1.4%。 |
|
|
|
--- |
|
|
|
## Evaluation |
|
|
|
这里主要跟蒸馏前对应的 teacher 模型作了对比: |
|
|
|
*性能:*(基于1万条数据测试,GPU设备是V100) |
|
|
|
| | Teacher | Student | Gap | |
|
| ---------- | ------------------------- | ------------------------------ | ----- | |
|
| Model | Dmeta-Embedding-zh (411M) | Dmeta-Embedding-zh-small (297M)| 0.67x | |
|
| Cost | 127s | 89s | -30% | |
|
| Latency | 13ms | 9ms | -31% | |
|
| Throughput | 78 sentence/s | 111 sentence/s | 1.4x | |
|
|
|
|
|
*精度:*(参考自MTEB榜单) |
|
|
|
| | **Classification** | **Clustering** | **Pair Classification** | **Reranking** | **Retrieval** | **STS** | **Avg** | |
|
| ----------------------------- | ----------------- | -------------- | ----------------------- | ------------- | ------------- | ------- | ------- | |
|
| **Dmeta-Embedding-zh** | 70 | 50.96 | 88.92 | 67.17 | 70.41 | 64.89 | 67.51 | |
|
| **Dmeta-Embedding-zh-small** | 69.89 | 50.8 | 87.57 | 66.92 | 67.7 | 62.13 | 66.1 | |
|
| **Gap** | -0.11 | -0.16 | -1.35 | -0.25 | -2.71 | -2.76 | -1.41 | |
|
|
|
|
|
## Usage |
|
|
|
目前模型支持通过 [Sentence-Transformers](#sentence-transformers), [Langchain](#langchain), [HuggingFace Transformers](#huggingface-transformers) 等主流框架进行推理,具体用法参考各个框架的示例。 |
|
|
|
### Sentence-Transformers |
|
|
|
Dmeta-embedding 模型支持通过 [sentence-transformers](https://www.SBERT.net) 来加载推理: |
|
|
|
``` |
|
pip install -U sentence-transformers |
|
``` |
|
|
|
```python |
|
from sentence_transformers import SentenceTransformer |
|
texts1 = ["胡子长得太快怎么办?", "在香港哪里买手表好"] |
|
texts2 = ["胡子长得快怎么办?", "怎样使胡子不浓密!", "香港买手表哪里好", "在杭州手机到哪里买"] |
|
model = SentenceTransformer('DMetaSoul/Dmeta-embedding-zh-small') |
|
embs1 = model.encode(texts1, normalize_embeddings=True) |
|
embs2 = model.encode(texts2, normalize_embeddings=True) |
|
# 计算两两相似度 |
|
similarity = embs1 @ embs2.T |
|
print(similarity) |
|
# 获取 texts1[i] 对应的最相似 texts2[j] |
|
for i in range(len(texts1)): |
|
scores = [] |
|
for j in range(len(texts2)): |
|
scores.append([texts2[j], similarity[i][j]]) |
|
scores = sorted(scores, key=lambda x:x[1], reverse=True) |
|
print(f"查询文本:{texts1[i]}") |
|
for text2, score in scores: |
|
print(f"相似文本:{text2},打分:{score}") |
|
print() |
|
``` |
|
|
|
示例输出如下: |
|
|
|
``` |
|
查询文本:胡子长得太快怎么办? |
|
相似文本:胡子长得快怎么办?,打分:0.965681254863739 |
|
相似文本:怎样使胡子不浓密!,打分:0.7353651523590088 |
|
相似文本:香港买手表哪里好,打分:0.24928246438503265 |
|
相似文本:在杭州手机到哪里买,打分:0.2038613110780716 |
|
|
|
查询文本:在香港哪里买手表好 |
|
相似文本:香港买手表哪里好,打分:0.9916468262672424 |
|
相似文本:在杭州手机到哪里买,打分:0.498248815536499 |
|
相似文本:胡子长得快怎么办?,打分:0.2424771636724472 |
|
相似文本:怎样使胡子不浓密!,打分:0.21715955436229706 |
|
``` |
|
|
|
### Langchain |
|
|
|
Dmeta-embedding 模型支持通过 LLM 工具框架 [langchain](https://www.langchain.com/) 来加载推理: |
|
|
|
``` |
|
pip install -U langchain |
|
``` |
|
|
|
```python |
|
import torch |
|
import numpy as np |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
model_name = "DMetaSoul/Dmeta-embedding-zh-small" |
|
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'} |
|
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity |
|
model = HuggingFaceEmbeddings( |
|
model_name=model_name, |
|
model_kwargs=model_kwargs, |
|
encode_kwargs=encode_kwargs, |
|
) |
|
texts1 = ["胡子长得太快怎么办?", "在香港哪里买手表好"] |
|
texts2 = ["胡子长得快怎么办?", "怎样使胡子不浓密!", "香港买手表哪里好", "在杭州手机到哪里买"] |
|
embs1 = model.embed_documents(texts1) |
|
embs2 = model.embed_documents(texts2) |
|
embs1, embs2 = np.array(embs1), np.array(embs2) |
|
# 计算两两相似度 |
|
similarity = embs1 @ embs2.T |
|
print(similarity) |
|
# 获取 texts1[i] 对应的最相似 texts2[j] |
|
for i in range(len(texts1)): |
|
scores = [] |
|
for j in range(len(texts2)): |
|
scores.append([texts2[j], similarity[i][j]]) |
|
scores = sorted(scores, key=lambda x:x[1], reverse=True) |
|
print(f"查询文本:{texts1[i]}") |
|
for text2, score in scores: |
|
print(f"相似文本:{text2},打分:{score}") |
|
print() |
|
``` |
|
|
|
### HuggingFace Transformers |
|
|
|
Dmeta-embedding 模型支持通过 [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) 框架来加载推理: |
|
|
|
``` |
|
pip install -U transformers |
|
``` |
|
|
|
```python |
|
import torch |
|
from transformers import AutoTokenizer, AutoModel |
|
def mean_pooling(model_output, attention_mask): |
|
token_embeddings = model_output[0] #First element of model_output contains all token embeddings |
|
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() |
|
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) |
|
def cls_pooling(model_output): |
|
return model_output[0][:, 0] |
|
texts1 = ["胡子长得太快怎么办?", "在香港哪里买手表好"] |
|
texts2 = ["胡子长得快怎么办?", "怎样使胡子不浓密!", "香港买手表哪里好", "在杭州手机到哪里买"] |
|
tokenizer = AutoTokenizer.from_pretrained('DMetaSoul/Dmeta-embedding-zh-small') |
|
model = AutoModel.from_pretrained('DMetaSoul/Dmeta-embedding-zh-small') |
|
model.eval() |
|
with torch.no_grad(): |
|
inputs1 = tokenizer(texts1, padding=True, truncation=True, return_tensors='pt') |
|
inputs2 = tokenizer(texts2, padding=True, truncation=True, return_tensors='pt') |
|
model_output1 = model(**inputs1) |
|
model_output2 = model(**inputs2) |
|
embs1, embs2 = cls_pooling(model_output1), cls_pooling(model_output2) |
|
embs1 = torch.nn.functional.normalize(embs1, p=2, dim=1).numpy() |
|
embs2 = torch.nn.functional.normalize(embs2, p=2, dim=1).numpy() |
|
# 计算两两相似度 |
|
similarity = embs1 @ embs2.T |
|
print(similarity) |
|
# 获取 texts1[i] 对应的最相似 texts2[j] |
|
for i in range(len(texts1)): |
|
scores = [] |
|
for j in range(len(texts2)): |
|
scores.append([texts2[j], similarity[i][j]]) |
|
scores = sorted(scores, key=lambda x:x[1], reverse=True) |
|
print(f"查询文本:{texts1[i]}") |
|
for text2, score in scores: |
|
print(f"相似文本:{text2},打分:{score}") |
|
print() |
|
``` |
|
## Contact |
|
您如果在使用过程中,遇到任何问题,欢迎前往[讨论区](https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/discussions)建言献策。 |
|
您也可以联系我们:赵中昊 <[email protected]>, 肖文斌 <[email protected]>, 孙凯 <[email protected]> |
|
同时我们也开通了微信群,可扫码加入我们(人数超200了,先加管理员再拉进群),一起共建 AIGC 技术生态! |
|
<img src="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/resolve/main/weixin.jpeg" alt="微信群二维码" style="display: block; margin-left: auto; margin-right: auto; width: 256px; height: 358px;"/> |
|
## License |
|
Dmeta-embedding 系列模型采用 Apache-2.0 License,开源模型可以进行免费商用私有部署。 |