|
--- |
|
license: apache-2.0 |
|
language: |
|
- en |
|
- zh |
|
library_name: transformers |
|
tags: |
|
- mteb |
|
- RAG-reranking |
|
model-index: |
|
- name: LdIR-reranker-large |
|
results: |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/CMedQAv1-reranking |
|
name: MTEB CMedQAv1 |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 86.50438688414654 |
|
- type: mrr |
|
value: 88.91170634920635 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/CMedQAv2-reranking |
|
name: MTEB CMedQAv2 |
|
config: default |
|
split: test |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 87.10592353383732 |
|
- type: mrr |
|
value: 89.10178571428571 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/Mmarco-reranking |
|
name: MTEB MMarcoReranking |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 39.354813242907133 |
|
- type: mrr |
|
value: 39.075793650793655 |
|
- task: |
|
type: Reranking |
|
dataset: |
|
type: C-MTEB/T2Reranking |
|
name: MTEB T2Reranking |
|
config: default |
|
split: dev |
|
revision: None |
|
metrics: |
|
- type: map |
|
value: 68.83696915006163 |
|
- type: mrr |
|
value: 79.77644651857584 |
|
--- |
|
|
|
## Introduction |
|
|
|
This model applies [Qwen/Qwen2-1.5B](https://huggingface.co/Qwen/Qwen2-1.5B) to a downstream reranking task. |
|
We build on the work of the [FlagEmbedding reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/reranker), |
|
implementing it with Qwen2-1.5B as the pretrained model. |
|
|
|
## Dependencies |
|
|
|
```text |
|
transformers==4.41.2 |
|
flash-attn==2.5.7 |
|
``` |
|
|
|
## Usage |
|
|
|
```python |
|
from typing import cast, List, Union, Tuple, Dict, Optional |
|
import numpy as np |
|
import torch |
|
from tqdm import tqdm |
|
import transformers |
|
from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizer, DataCollatorWithPadding |
|
from transformers.models.qwen2 import Qwen2Config, Qwen2ForSequenceClassification |
|
from transformers.trainer_pt_utils import LabelSmoother |
|
IGNORE_TOKEN_ID = LabelSmoother.ignore_index |
|
|
|
def preprocess(
    sources,
    tokenizer: "transformers.PreTrainedTokenizer",
    max_len: int = 1024,
) -> Dict:
    """Render each text group with the chat template and tokenize it.

    Every element of ``sources`` is joined with a blank line, wrapped as a
    single user message, rendered through ``tokenizer.apply_chat_template``
    (with the generation prompt appended) and tokenized on its own.
    Sequences longer than ``max_len`` are shortened by dropping the tokens
    immediately before the trailing template tokens, so the prompt suffix is
    always preserved.

    Args:
        sources: Iterable of string sequences, e.g. ``(query, passage)`` pairs.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``__call__``.
        max_len: Maximum number of tokens kept per sequence.

    Returns:
        A dict with ``input_ids`` and ``attention_mask``; each value is a list
        holding one unpadded sequence per source.

    Raises:
        ValueError: If a sequence must be truncated but ``max_len`` is smaller
            than the template suffix that has to be kept.
    """
    # Trailing template tokens that must survive truncation:
    # <|im_end|>(151645), \n(198), <|im_start|>(151644), assistant(77091), \n(198)
    suffix_len = 5

    input_ids, attention_masks = [], []
    for source in sources:
        # Apply prompt templates
        messages = [{"role": "user", "content": "\n\n".join(source)}]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text])
        input_id = model_inputs['input_ids'][0]
        attention_mask = model_inputs['attention_mask'][0]

        if len(input_id) > max_len:
            if max_len < suffix_len:
                # Previously this surfaced as a bare AssertionError (or was
                # silently ignored under ``python -O``).
                raise ValueError(
                    f"max_len ({max_len}) must be at least {suffix_len} to keep "
                    "the chat-template suffix when truncating"
                )
            # Keep the head of the sequence plus the template suffix.
            head_len = max_len - suffix_len
            input_id = input_id[:head_len] + input_id[-suffix_len:]
            attention_mask = attention_mask[:head_len] + attention_mask[-suffix_len:]

        input_ids.append(input_id)
        attention_masks.append(attention_mask)

    return dict(
        input_ids=input_ids,
        attention_mask=attention_masks,
    )
|
|
|
class FlagRerankerCustom:
    """Score text pairs with a sequence-classification reranker model.

    The wrapper moves the model to the best available device (CUDA, then MPS,
    then CPU), switches it to eval mode, optionally casts it to half
    precision, and wraps it in ``torch.nn.DataParallel`` when more than one
    CUDA device is visible.
    """

    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
        use_fp16: bool = False
    ) -> None:
        """Store the model/tokenizer and prepare the model for inference.

        Args:
            model: Model whose forward output exposes ``logits``.
            tokenizer: Tokenizer used for chat templating and padding.
            use_fp16: Cast the model to half precision; ignored on CPU.
        """
        self.tokenizer = tokenizer
        self.model = model
        self.data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        elif torch.backends.mps.is_available():
            self.device = torch.device('mps')
        else:
            self.device = torch.device('cpu')
            # Half precision is switched off when falling back to CPU.
            use_fp16 = False
        if use_fp16:
            self.model.half()

        self.model = self.model.to(self.device)
        self.model.eval()

        self.num_gpus = torch.cuda.device_count()
        if self.num_gpus > 1:
            print(f"----------using {self.num_gpus}*GPUs----------")
            self.model = torch.nn.DataParallel(self.model)

    @torch.no_grad()
    def compute_score(self, sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], batch_size: int = 64,
                      max_length: int = 1024) -> Union[List[float], float]:
        """Compute relevance scores for one pair or a list of pairs.

        Args:
            sentence_pairs: Either a single pair of strings or a list/tuple of
                such pairs. A sequence whose first element is a string is
                treated as one pair.
            batch_size: Pairs per forward pass and per CUDA device; it is
                multiplied by the number of CUDA devices when any are present.
            max_length: Token budget per pair, forwarded to ``preprocess``.

        Returns:
            A list with one score per pair, or a bare float when exactly one
            score was produced. An empty input yields an empty list.

        Raises:
            TypeError: If ``sentence_pairs`` is neither a list nor a tuple.
        """
        if not isinstance(sentence_pairs, (list, tuple)):
            raise TypeError(
                "sentence_pairs must be a pair of strings or a list of such pairs, "
                f"got {type(sentence_pairs).__name__}"
            )
        if not sentence_pairs:
            # Nothing to score; avoids indexing into an empty sequence below.
            return []
        if isinstance(sentence_pairs[0], str):
            sentence_pairs = [sentence_pairs]

        if self.num_gpus > 0:
            batch_size = batch_size * self.num_gpus

        all_scores = []
        for start_index in tqdm(range(0, len(sentence_pairs), batch_size), desc="Compute Scores",
                                disable=True):
            sentences_batch = sentence_pairs[start_index:start_index + batch_size]
            encoded = preprocess(sources=sentences_batch, tokenizer=self.tokenizer, max_len=max_length)
            # The collator expects one feature dict per example, not a dict of lists.
            features = [dict(zip(encoded, row)) for row in zip(*encoded.values())]
            batch = self.data_collator(features).to(self.device)
            logits = self.model(**batch, return_dict=True).logits
            # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
            # batch of one into a 0-d tensor whose tolist() is a float, which
            # cannot be passed to list.extend.
            scores = logits.reshape(-1)
            all_scores.extend(scores.detach().to(torch.float).cpu().numpy().tolist())

        if len(all_scores) == 1:
            return all_scores[0]
        return all_scores
|
|
|
# Hub identifier shared by the tokenizer, config and model weights.
model_name = "neofung/LdIR-Qwen2-reranker-1.5B"

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, padding_side="right")

# NOTE(review): `bf16=True` is forwarded to the config loader exactly as in
# the original snippet; confirm it has the intended effect on the model dtype.
config = Qwen2Config.from_pretrained(model_name, trust_remote_code=True, bf16=True)

model = Qwen2ForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    trust_remote_code=True,
)

# From here on `model` refers to the scoring wrapper, not the raw network.
model = FlagRerankerCustom(model=model, tokenizer=tokenizer, use_fp16=False)

pairs = [
    ['what is panda?', 'hi'],
    ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.'],
]

model.compute_score(pairs)

# [-2.655318021774292, 11.7670316696167]
|
``` |
|
|
|
|
|
## Evaluation on C-MTEB |
|
|
|
```python |
|
|
|
from C_MTEB.tasks import * |
|
from mteb import MTEB |
|
|
|
save_name = "LdIR-Qwen2-reranker-1.5B"

# Select the reranking tasks for the listed language settings.
evaluation = MTEB(
    task_types=["Reranking"],
    task_langs=['zh', 'zh2en', 'en2zh'],
)

# `model` is presumably the FlagRerankerCustom wrapper built in the Usage snippet.
evaluation.run(model, output_folder=f"reranker_results/{save_name}")
|
``` |