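"""InstructScore scorer.

Loads a fine-tuned LLaMA model that is prompted to list translation errors
with major/minor severity labels, then converts the reported error counts
into a numeric score.
"""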
from typing import Dict

import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer

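# The special-token defaults below mirror the original Stanford Alpaca
# recipe; bos_token and unk_token are set to "</s>" as well, presumably to
# match the tokenizer configuration used during fine-tuning.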
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"

MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 512

print("Max source length: ", MAX_SOURCE_LENGTH)
print("Max target length: ", MAX_TARGET_LENGTH)


def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
):
    """Add special tokens to the tokenizer.

    Note: this is the unoptimized version that may make your embedding size
    not divisible by 64. Despite the name, only the tokenizer is updated
    here; the model checkpoint loaded below is expected to already include
    the matching, resized embeddings.
    """
    tokenizer.add_special_tokens(special_tokens_dict)
    tokenizer.add_special_tokens(
        {
            "eos_token": DEFAULT_EOS_TOKEN,
            "bos_token": DEFAULT_BOS_TOKEN,
            "unk_token": DEFAULT_UNK_TOKEN,
        }
    )
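

# For reference, a minimal sketch of the full Alpaca-style resize that this
# function's name alludes to (an assumption for illustration, not part of the
# released pipeline): it also grows the model's embedding matrix and
# initializes the new rows to the mean of the existing embeddings.
def resize_model_embeddings_sketch(
    model: transformers.PreTrainedModel,
    tokenizer: transformers.PreTrainedTokenizer,
    num_new_tokens: int,
):
    model.resize_token_embeddings(len(tokenizer))
    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data
        # Fill the new rows with the mean of the pre-existing embeddings.
        input_embeddings[-num_new_tokens:] = input_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True
        )
        output_embeddings[-num_new_tokens:] = output_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True
        )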


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


class InstructScore:
    def __init__(self):
        self.tokenizer = LlamaTokenizer.from_pretrained(
            "InstructScore_Tok", model_max_length=MAX_SOURCE_LENGTH, use_fast=False
        )
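        # Left-pad batched prompts so that generation for every sequence in
        # the batch continues directly from the end of its prompt.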
        self.tokenizer.padding_side = "left"

        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=self.tokenizer,
        )
        self.model = LlamaForCausalLM.from_pretrained("InstructScore_English").to(
            device
        )
        self.model.eval()

    def score(self, ref_ls, out_ls):
        prompt_ls = [
            f"You are evaluating Chinese-to-English Machine translation task. "
            f"The correct translation is \"{ref}\". The model generated "
            f"translation is \"{out}\". Please identify all errors within "
            "each model output, up to a maximum of five. For each error, "
            "please give me the corresponding error type, major/minor label, "
            "error location of the model generated translation and "
            "explanation for the error. Major errors can confuse or mislead "
            "the reader due to significant change in meaning, while minor "
            "errors don't lead to loss of meaning but will be noticed."
            for ref, out in zip(ref_ls, out_ls)
        ]
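        # Note: the prompt wording above is kept verbatim, since the model
        # was presumably fine-tuned on this exact instruction template.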

        with torch.no_grad():
            inputs = self.tokenizer(
                prompt_ls,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_SOURCE_LENGTH,
            )
            outputs = self.model.generate(
                inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                max_new_tokens=MAX_TARGET_LENGTH,
            )
            batch_outputs = self.tokenizer.batch_decode(
                outputs,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
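            # Severity-weighted score: each reported minor error costs 1
            # point and each major error costs 5, so 0 means no errors found.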
            scores_ls = [
                (-1) * output.count("Major/minor: Minor")
                + (-5) * output.count("Major/minor: Major")
                for output in batch_outputs
            ]
        return batch_outputs, scores_ls


def main():
    refs = [
        "SEScore is a simple but effective next generation text generation evaluation metric",
        "SEScore it really works",
    ]
    outs = [
        "SEScore is a simple effective text evaluation metric for next generation",
        "SEScore is not working",
    ]

    scorer = InstructScore()
    batch_outputs, scores_ls = scorer.score(refs, outs)
    print(batch_outputs)
    print(scores_ls)


if __name__ == "__main__":
    main()