BeardedMonster
/

SabiYarn-125M

Text Generation

Transformers

Safetensors

nanogpt-j

custom_code

Model card Files and versions Community

BeardedMonster committed on Jan 7

Commit

2e195a2

verified ·

1 Parent(s): 7eb7f81

Upload GPTJXForCausalLM

Browse files

Files changed (1) hide show

pretrained_model.py +4 -61

pretrained_model.py CHANGED Viewed

@@ -173,9 +173,6 @@ class GPTJXForCausalLM(PreTrainedModel):
         device = idx.device
         b, t = idx.size()
-        # attn_mask = _prepare_mask_(idx, b, eval)
-        # print("attention mask: ", attn_mask)
         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
         pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)
@@ -186,17 +183,16 @@ class GPTJXForCausalLM(PreTrainedModel):
         for block in self.transformer.h:
             x = block(x, attn_mask=attn_mask)
         x = self.transformer.ln_f(x)
         if targets is not None:
-            # if we are given some desired targets also calculate the loss
-            logits = self.lm_head(x)
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
         else:
-            # inference-time mini-optimization: only forward the lm_head on the very last position
-            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
             loss = None
-        # return {"logits": logits, "loss": loss}
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
@@ -213,38 +209,6 @@ class GPTJXForCausalLM(PreTrainedModel):
             model_inputs["attn_mask"] = attention_mask
         return model_inputs
-    # @torch.no_grad()
-    # def stream(self, idx, max_new_tokens, temperature=1.0, top_k=None,gen_mode="greedy"):
-    #     """
-    #     Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
-    #     the sequence max_new_tokens times, feeding the predictions back into the model each time.
-    #     Most likely you'll want to make sure to be in model.eval() mode of operation for this.
-    #     """
-    #     for _ in range(max_new_tokens):
-    #         # if the sequence context is growing too long we must crop it at block_size
-    #         idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
-    #         # forward the model to get the logits for the index in the sequence
-    #         logits, _ = self(idx_cond, eval=True)
-    #         # pluck the logits at the final step and scale by desired temperature
-    #         logits = logits[:, -1, :] / temperature
-    #         # optionally crop the logits to only the top k options
-    #         if top_k is not None:
-    #             v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-    #             logits[logits < v[:, [-1]]] = -float('Inf')
-    #         # apply softmax to convert logits to (normalized) probabilities
-    #         probs = F.softmax(logits, dim=-1)
-    #         # sample from the distribution
-    #         if gen_mode == 'greedy':
-    #             idx_next = torch.argmax(probs, dim=-1).unsqueeze(0)
-    #         else:
-    #             idx_next = torch.multinomial(probs, num_samples=1)
-    #         # print(idx_next.shape, idx.shape)
-    #         idx = torch.cat((idx, idx_next), dim=1)
-    #         # append sampled index to the running sequence and continue
-    #         yield idx_next
     def crop_block_size(self, block_size):
@@ -263,24 +227,3 @@ AutoConfig.register("nanogpt-j", GPTJXConfig)
 AutoModel.register(GPTJXConfig,GPTJXForCausalLM)
 AutoModelForCausalLM.register(GPTJXConfig, GPTJXForCausalLM)
-# if __name__ == '__main__':
-#     from transformers import AutoTokenizer
-#     tokenizer = AutoTokenizer.from_pretrained("BeardedMonster/SabiYarn")
-#     input_ids = tokenizer("Awọn eeyan Cairo, ni Egypt ti bẹrẹ si n to lawọn ileesẹ to n ṣe burẹdi bayii.", return_tensors="pt")["input_ids"]
-#     targets = input_ids
-#     # config  = GPTJConfig()
-#     # config.save_pretrained("gptj-config")
-#     # new_config = GPTJ.from_pretrained("gptj-config")
-#     # model = GPTJ(config)
-#     # state_dict = torch.load('model.pt', map_location="cpu")
-#     # model.load_state_dict(state_dict)
-#     model = GPTJXForCausalLM.from_pretrained("/pretrainedmodel")
-#     # model.save_pretrained("/pretrainedmodel")
-#     # outputs = model(input_ids, targets)
-#     # print(outputs)
-#     output = model.generate(input_ids, max_new_tokens=50)
-#     print(tokenizer.decode(output[0]))
-    # print(new_config)

         device = idx.device
         b, t = idx.size()
         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
         pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)
         for block in self.transformer.h:
             x = block(x, attn_mask=attn_mask)
         x = self.transformer.ln_f(x)
+        logits = self.lm_head(x)  # logits over the entire sequence, shape (b, t, vocab_size)
         if targets is not None:
+            # If targets are provided, compute the loss
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
         else:
+            # Inference-time: return logits for each timestep
             loss = None
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
             model_inputs["attn_mask"] = attention_mask
         return model_inputs
     def crop_block_size(self, block_size):
 AutoModel.register(GPTJXConfig,GPTJXForCausalLM)
 AutoModelForCausalLM.register(GPTJXConfig, GPTJXForCausalLM)