maxoul committed
Commit ac4eef4 · verified · 1 Parent(s): 9e5c9e5

Upload PISCO

Files changed (2)
  1. config.json +1 -1
  2. modelling_pisco.py +78 -59
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-    "_name_or_path": "/scratch/1/user/mlouis/calmar/pisco_hub_models/mistral_with_mistral_labels",
+    "_name_or_path": "/scratch/1/user/mlouis/calmar/pisco_hub_models/pisco-mistral",
     "architectures": [
         "PISCO"
     ],
modelling_pisco.py CHANGED
@@ -108,12 +108,29 @@ class PISCO(PreTrainedModel):
     def compress(self, enc_input_ids, enc_attention_mask):
         return self.compr_decoder(enc_input_ids, enc_attention_mask)
 
-    def replace_emb(self, enc_input_ids, compressed_embs, dec_input_ids):
+    def replace_emb(self, compressed_embs, dec_input_ids):
         """
-        Compression logic (either with decoder or with dedicated compressor)
+        Create an input embedding vector combining the compressed_embs and the dec_input_ids
         """
-        indices = range(0, enc_input_ids.size(0) + 1, self.generation_top_k)
-        input_embeds = self.replace_embeddings(compressed_embs, dec_input_ids, indices)
+        indices = range(0, compressed_embs.size(0) + 1, self.generation_top_k)
+
+        input_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
+        num_embs = compressed_embs.size(1)
+        if self.sep:
+            slot_len = num_embs + 1
+        else:
+            slot_len = num_embs
+        # get first mem_token indices
+        first_mem_token_indices = torch.argmax((dec_input_ids == self.tokenizer.mem_token_ids[0]).int(), dim=1)
+        batch_size = input_embeds.size(0)
+        # for each example in batch, replace them with compressed embeddings
+        for i in range(batch_size):
+            for j in range(indices[i], indices[i + 1]):
+                start_idx = first_mem_token_indices[i].item() + (j - indices[i]) * slot_len
+                assert input_embeds[i, start_idx:start_idx + num_embs, :].size() == compressed_embs[j].size(), \
+                    f"{input_embeds[i, start_idx:start_idx + num_embs, :].size()} VS {compressed_embs[j].size()}"
+                input_embeds[i, start_idx:start_idx + num_embs, :] = compressed_embs[j]
+
         return input_embeds
 
     def compr_decoder(self, input_ids, attention_mask):
@@ -126,24 +143,16 @@ class PISCO(PreTrainedModel):
         if 'encoder_adapter' in self.adapter_keys:
             self.decoder.set_adapter('encoder_adapter')
 
-        print(self.decoder.device, input_ids.device, attention_mask.device)
-
         emb = self.decoder(input_ids=input_ids,
                            attention_mask=attention_mask,
                            output_hidden_states=True).hidden_states[-1]
         mask = torch.isin(input_ids, self.tokenizer.mem_token_ids_pt.to(input_ids.device))
         return emb[mask].reshape(emb.size(0), -1, emb.size(-1))
 
-    def prepare_encoder_inputs_to_decoder(self, texts, max_length, q_texts=None):
-        if q_texts is not None:
-            texts_to_encode = [self.tokenizer.enc_token + self.tokenizer.bos_token + '\nQuery:\n' + query + 'Document:\n' + text + self.tokenizer.eos_token
-                               for text, query in zip(texts, q_texts)]
-            inp_enc = self.tokenizer(texts_to_encode, return_tensors='pt', padding='max_length', max_length=max_length + 8, truncation=True, add_special_tokens=False)
-        else:
-            inp_enc = [self.tokenizer.enc_token + self.tokenizer.bos_token + text + self.tokenizer.eos_token for text in texts]
-            inp_enc = self.tokenizer(inp_enc, return_tensors='pt', padding="longest", max_length=max_length+3, truncation=True, add_special_tokens=False)
-
-        num_mem_tokens = 128 // self.compr_rate # maybe change that
+    def prepare_encoder_inputs_to_decoder(self, texts, max_length):
+        inp_enc = [self.tokenizer.enc_token + self.tokenizer.bos_token + text + self.tokenizer.eos_token for text in texts]
+        inp_enc = self.tokenizer(inp_enc, return_tensors='pt', padding="longest", max_length=max_length+3, truncation=True, add_special_tokens=False)
+        num_mem_tokens = 128 // self.compr_rate # hardcode size
         assert num_mem_tokens == len(self.tokenizer.mem_tokens)
         inp_enc['input_ids'], inp_enc['attention_mask'] = add_memory_tokens_to_inputs(inp_enc['input_ids'],
                                                                                        inp_enc['attention_mask'],
@@ -155,28 +164,6 @@ class PISCO(PreTrainedModel):
     def prepare_encoder_inputs(self, texts, max_length):
         return self.prepare_encoder_inputs_to_decoder(texts, max_length)
 
-    def replace_embeddings(self, compressed_embs, dec_input_ids, indices):
-        """
-        Replace memory tokens in the decoder input to with the compressed embeddings
-        """
-        inputs_embeds = self.decoder.get_input_embeddings()(dec_input_ids)
-        num_embs = compressed_embs.size(1)
-        if self.sep:
-            slot_len = num_embs + 1
-        else:
-            slot_len = num_embs
-        # get first mem_token indices
-        first_mem_token_indices = torch.argmax((dec_input_ids == self.tokenizer.mem_token_ids[0]).int(), dim=1)
-        batch_size = inputs_embeds.size(0)
-        # for each example in batch, replace them with compressed embeddings
-        for i in range(batch_size):
-            for j in range(indices[i], indices[i + 1]):
-                start_idx = first_mem_token_indices[i].item() + (j - indices[i]) * slot_len
-                assert inputs_embeds[i, start_idx:start_idx + num_embs, :].size() == compressed_embs[j].size(), \
-                    f"{inputs_embeds[i, start_idx:start_idx + num_embs, :].size()} VS {compressed_embs[j].size()}"
-                inputs_embeds[i, start_idx:start_idx + num_embs, :] = compressed_embs[j]
-        return inputs_embeds
-
     def forward(self,
                 enc_input_ids: torch.LongTensor = None,
                 enc_attention_mask: torch.LongTensor = None,
@@ -204,7 +191,7 @@ class PISCO(PreTrainedModel):
 
         # Perform compression with gradient tracking
         compressed_embs = self.compress(enc_input_ids, enc_attention_mask)
-        inputs_embeds = self.replace_emb(enc_input_ids, compressed_embs, dec_input_ids)
+        inputs_embeds = self.replace_emb(compressed_embs, dec_input_ids)
 
         # decoding
         if 'decoder_adapter' in self.adapter_keys:
@@ -218,42 +205,80 @@ class PISCO(PreTrainedModel):
         return {"loss": decoder_outputs.loss, "logits": decoder_outputs.logits}
 
     def generate_from_text(self, questions: list[str], documents: list[list[str]], max_new_tokens: int = 128) -> list[str]:
-        # TODO: test
+        """
+        Generates answers from documents (via compression then decoding)
+        questions: list of string
+        documents: list of list of strings (they should all be of equal length: the nb of doc for each question)
+        """
         self.generation_top_k = len(documents[0])
         assert len(documents) == len(questions)
        assert all([len(context) == len(documents[0]) for context in documents])
         flat_documents = sum(documents, [])
 
         model_input = {}
+
+        # Creating encoder inputs:
         input_encoder = self.prepare_encoder_inputs(flat_documents, max_length=128)
         device = self.decoder.device
-
         model_input['enc_input_ids'], model_input['enc_attention_mask'] = input_encoder['input_ids'].to(device), input_encoder['attention_mask'].to(device)
 
+        # Creating decoder inputs
         instr = [self.blend_prompt_and_memory_tokens(query=q) for q in questions]
-
         inp_dec = self.tokenizer(instr, return_tensors='pt', padding="longest", add_special_tokens=False, truncation=True, max_length=2048)
-
         model_input['dec_input_ids'], model_input['dec_attention_mask'] = inp_dec['input_ids'].to(device), inp_dec['attention_mask'].to(device)
 
+        # Generation
         return self.generate(model_input, max_new_tokens=max_new_tokens)
+
+    def generate_from_compressed_documents_and_questions(self, questions: list[str], compressed_documents: torch.Tensor, max_new_tokens: int = 128) -> list[str]:
+        """
+        Generates answers from compressed documents
+        questions: list of string
+        compressed_documents: torch tensor, its first dimension should be a multiple of len(questions)
+        """
+        print(compressed_documents.size(), len(questions))
+        self.generation_top_k = compressed_documents.size(0) // len(questions)
+        assert compressed_documents.size(0) % self.generation_top_k == 0, f"{compressed_documents.size(0)} {self.generation_top_k}"
+
+        # Creating decoder inputs
+        instr = [self.blend_prompt_and_memory_tokens(query=q) for q in questions]
+        inp_dec = self.tokenizer(instr, return_tensors='pt', padding="longest", add_special_tokens=False, truncation=True, max_length=2048)
+        device = self.decoder.device
+        dec_input_ids, dec_attention_mask = inp_dec['input_ids'].to(device), inp_dec['attention_mask'].to(device)
+
+        # Creating input decoder embeddings from prompt + compressed documents
+        inputs_embeds = self.replace_emb(compressed_documents, dec_input_ids)
+
+        # Activating decoder generator:
+        if 'decoder_adapter' in self.adapter_keys:
+            self.decoder.set_adapter('decoder_adapter')
+
+        output_ids = self.decoder.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=dec_attention_mask,
+            generation_config=self.generation_config,
+            max_new_tokens=max_new_tokens
+        )
+
+        # de-tokenizing
+        return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
 
     def compress_documents(self, documents: list[str]) -> torch.Tensor:
-        # TODO: test
+        """
+        Compress a list of documents
+        """
         input_encoder = self.prepare_encoder_inputs(documents, max_length=128)
         enc_input_ids = input_encoder['input_ids'].to(self.decoder.device)
         attention_mask = input_encoder['attention_mask'].to(self.decoder.device)
-        print('yo', self.decoder.device, enc_input_ids.device, attention_mask.device)
         return self.compress(enc_input_ids=enc_input_ids, enc_attention_mask=attention_mask)
 
     def generate(self, model_input, max_new_tokens=128):
+        """
+        Generation pipeline including compression + decoding from compressed
+        """
 
         enc_input_ids, enc_attention_mask, dec_input_ids, dec_attention_mask = model_input['enc_input_ids'], model_input['enc_attention_mask'], model_input['dec_input_ids'], model_input['dec_attention_mask']
 
-        print('in gen')
-        print(enc_input_ids.size())
-        print(dec_input_ids.size())
-
         assert enc_input_ids.size() == enc_attention_mask.size()
 
         if len(enc_input_ids.size()) == 3: # likely from bergen: we just flatten all of this to perform encoding in one batch
@@ -266,13 +291,11 @@ class PISCO(PreTrainedModel):
             f"{enc_input_ids.size(0)} VS {dec_input_ids.size(0)} with generation_top_k={self.generation_top_k}"
 
         compressed_embs = self.compress(enc_input_ids, enc_attention_mask)
-        inputs_embeds = self.replace_emb(enc_input_ids, compressed_embs, dec_input_ids)
+        inputs_embeds = self.replace_emb(compressed_embs, dec_input_ids)
 
-        # Switch adapter if we are training two different ones:
         if 'decoder_adapter' in self.adapter_keys:
-            self.decoder.set_adapter('decoder_adapter')
+            self.decoder.set_adapter('decoder_adapter')
 
-
         output_ids = self.decoder.generate(
             inputs_embeds=inputs_embeds,
             attention_mask=dec_attention_mask,
@@ -280,18 +303,14 @@ class PISCO(PreTrainedModel):
             max_new_tokens=max_new_tokens
         )
 
-        decoded = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+        return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
 
-        return decoded
-
     def blend_prompt_and_memory_tokens(self, query: str):
         """
         Takes care of blending the prompt with the memory tokens:
         Also returns, if a label is provided, the position of the first token index of the label (for loss comp later on)
         """
-
-        mem_tokens_str = ''.join(self.tokenizer.mem_tokens)
-        mem_tokens_str += self.tokenizer.sep_token
+        mem_tokens_str = ''.join(self.tokenizer.mem_tokens) + self.tokenizer.sep_token
 
         # proper names for "eval" call, don't remove these lines
         docs = mem_tokens_str * self.generation_top_k
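
For context, a minimal usage sketch of the entry points touched by this commit: generate_from_text, compress_documents, and the new generate_from_compressed_documents_and_questions. The repo id is a placeholder and the AutoModel / trust_remote_code loading path is an assumption for illustration, not part of the diff.

from transformers import AutoModel

# Placeholder repo id; loading via AutoModel with trust_remote_code assumes the
# repository registers this custom PISCO class as its auto model class.
model = AutoModel.from_pretrained("<pisco-repo-id>", trust_remote_code=True).eval()

questions = ["Who wrote Les Miserables?"]
# One list of documents per question; all lists must have the same length.
documents = [[
    "Victor Hugo published the novel Les Miserables in 1862.",
    "Les Miserables is a French historical novel set in the early 19th century.",
]]

# End-to-end path: compress the documents, then decode an answer.
answers = model.generate_from_text(questions, documents, max_new_tokens=64)

# Two-step path: compress once, reuse the compressed embeddings later.
flat_docs = [doc for docs in documents for doc in docs]
compressed = model.compress_documents(flat_docs)  # (n_docs, n_mem_tokens, hidden_size)
answers_again = model.generate_from_compressed_documents_and_questions(
    questions, compressed, max_new_tokens=64)

print(answers, answers_again)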