File size: 7,156 Bytes
a2db297 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
from abc import ABC, abstractmethod
from typing import List, Dict
import torch
from .utils import IMAGE_TOKEN_INDEX, IGNORE_INDEX, IMAGE_TOKEN
class ConversationFormatter(ABC):
support_tokenizer_types = None
def __init__(self, tokenizer):
tokenizer_type = type(tokenizer).__name__
assert tokenizer_type in self.support_tokenizer_types, \
f'Invalid tokenizer type, expected one from `{self.support_tokenizer_types}`, but got `{tokenizer_type}`'
@abstractmethod
def format(self, conversations: List[Dict], generation_preface=None):
pass
@abstractmethod
def format_query(self, query, generation_preface=""):
pass
class QwenConversationFormatter(ConversationFormatter):
support_tokenizer_types = ['QWenTokenizer', 'Qwen2TokenizerFast']
def __init__(self, tokenizer):
super().__init__(tokenizer)
self.tokenizer = tokenizer
self.from2role = {
"system": "<|im_start|>system\n",
"human": "<|im_start|>user\n",
"gpt": "<|im_start|>assistant\n",
}
self.gpt_token_num = None
self.im_end = "<|im_end|>"
self.image_symbol = IMAGE_TOKEN
self.image_token_index = IMAGE_TOKEN_INDEX
self.ignore_index = IGNORE_INDEX
self.default_system_prompt = "You are a helpful assistant."
def _tokenize_with_image_symbol(self, text):
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
text.split(self.image_symbol)]
token_ids = []
num_chuck = len(text_chunks)
for i, chunk in enumerate(text_chunks):
token_ids.extend(chunk)
if i < num_chuck - 1:
token_ids.append(self.image_token_index)
return token_ids
def format(self, conversations: List[Dict], generation_preface=None):
if self.gpt_token_num is None:
self.gpt_token_num = len(self.tokenizer(self.from2role["gpt"], add_special_tokens=False).input_ids)
if conversations[0]["from"] != "system":
conversations.insert(0, {
"from": "system",
"value": self.default_system_prompt
})
if generation_preface is not None:
conversations.append({
"from": "gpt",
"value": generation_preface
})
prompt = ""
input_ids = []
labels = []
num_conversation = len(conversations)
for i, conversation in enumerate(conversations):
frm = conversation["from"]
role = self.from2role[frm]
message = conversation["value"]
text = role + message
if i < num_conversation - 1 or generation_preface is None:
text += self.im_end
if i < num_conversation - 1:
text += '\n'
prompt += text
token_ids = self._tokenize_with_image_symbol(text)
input_ids.extend(token_ids)
label_ids = [self.ignore_index] * len(token_ids)
if frm == "gpt":
label_ids[self.gpt_token_num:] = token_ids[self.gpt_token_num:]
labels.extend(label_ids)
assert self._tokenize_with_image_symbol(prompt) == input_ids
assert len(input_ids) == len(labels)
input_ids = torch.tensor(input_ids, dtype=torch.long)
labels = torch.tensor(labels, dtype=torch.long)
return prompt, input_ids, labels
def format_query(self, query, generation_preface=""):
prompt, input_ids, _ = self.format([{
"from": "human",
"value": query
}], generation_preface=generation_preface)
return prompt, input_ids
class Llama3ConversationFormatter(ConversationFormatter):
support_tokenizer_types = ['PreTrainedTokenizerFast']
def __init__(self, tokenizer):
super().__init__(tokenizer)
self.tokenizer = tokenizer
self.from2role = {
"system": "<|start_header_id|>system<|end_header_id|>\n\n",
"human": "<|start_header_id|>user<|end_header_id|>\n\n",
"gpt": "<|start_header_id|>assistant<|end_header_id|>\n\n",
}
self.gpt_token_num = None
self.im_end = "<|eot_id|>"
self.image_symbol = IMAGE_TOKEN
self.image_token_index = IMAGE_TOKEN_INDEX
self.ignore_index = IGNORE_INDEX
self.default_system_prompt = "You are a helpful and honest multimodal assistant."
self.bos_token = "<|begin_of_text|>"
self.bos_token_ids = None
def _tokenize_with_image_symbol(self, text):
text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
text.split(self.image_symbol)]
token_ids = []
num_chuck = len(text_chunks)
for i, chunk in enumerate(text_chunks):
token_ids.extend(chunk)
if i < num_chuck - 1:
token_ids.append(self.image_token_index)
return token_ids
def format(self, conversations: List[Dict], generation_preface=None):
if self.gpt_token_num is None:
self.gpt_token_num = len(self.tokenizer(self.from2role["gpt"], add_special_tokens=False).input_ids)
if self.bos_token_ids is None:
self.bos_token_ids = self.tokenizer(self.bos_token, add_special_tokens=False).input_ids
if conversations[0]["from"] != "system":
conversations.insert(0, {
"from": "system",
"value": self.default_system_prompt
})
if generation_preface is not None:
conversations.append({
"from": "gpt",
"value": generation_preface
})
prompt = "" + self.bos_token
input_ids = [] + self.bos_token_ids
labels = [] + [IGNORE_INDEX] * len(input_ids)
num_conversation = len(conversations)
for i, conversation in enumerate(conversations):
frm = conversation["from"]
role = self.from2role[frm]
message = conversation["value"].strip()
text = role + message
if i < num_conversation - 1 or generation_preface is None:
text += self.im_end
prompt += text
token_ids = self._tokenize_with_image_symbol(text)
input_ids.extend(token_ids)
label_ids = [self.ignore_index] * len(token_ids)
if frm == "gpt":
label_ids[self.gpt_token_num:] = token_ids[self.gpt_token_num:]
labels.extend(label_ids)
assert self._tokenize_with_image_symbol(prompt) == input_ids
assert len(input_ids) == len(labels)
input_ids = torch.tensor(input_ids, dtype=torch.long)
labels = torch.tensor(labels, dtype=torch.long)
return prompt, input_ids, labels
def format_query(self, query, generation_preface=""):
prompt, input_ids, _ = self.format([{
"from": "human",
"value": query
}], generation_preface=generation_preface)
return prompt, input_ids
|