koukyo1994 committed
update llama_action model

modeling_llama_action.py  CHANGED  (+14 -1)
@@ -4,6 +4,7 @@ import torch
 import torch.nn as nn
 from transformers import LlamaForCausalLM
 from transformers.modeling_outputs import CausalLMOutputWithPast
+from tqdm import tqdm
 
 from .configuration_llama_action import LlamaActionConfig
 
@@ -204,11 +205,23 @@ class LlamaActionForCausalLM(LlamaForCausalLM):
         seq_length = input_ids.size(1)
         n_frames = seq_length // self.num_image_patches
         attention_mask_length = n_frames * (self.num_image_patches + self.num_action_embeddings)
+        if kwargs.pop("show_progress", False):
+            prefix = kwargs.pop("prefix", "")
+            max_length = kwargs.pop("max_length")
+            if past_key_values is None or len(past_key_values) == 0:
+                pbar = tqdm(total=max_length - len(input_ids[0]), desc=prefix, leave=False)
+                postfix = f"Frame [{n_frames + 1}/{max_length // self.num_image_patches}]"
+                pbar.set_postfix_str(postfix)
+            else:
+                pbar.update()
+
         if seq_length % self.num_image_patches != 0:
             n_last_frame_tokens = seq_length % self.num_image_patches
             attention_mask_length += n_last_frame_tokens
         else:
-
+            if kwargs.pop("show_progress", False):
+                postfix = f"Frame [{n_frames + 1}/{max_length // self.num_image_patches}]"
+                pbar.set_postfix(postfix)
         attention_mask = torch.ones((batch_size, attention_mask_length), device=input_ids.device, dtype=torch.long)
         # cut decoder_input_ids if past_key_values is used
         if past_key_values is not None and len(past_key_values) > 0:
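The change threads an optional tqdm bar through `prepare_inputs_for_generation`, so a caller can pass `show_progress=True` (plus `prefix` and `max_length`) through `generate`'s extra keyword arguments. Below is a minimal, self-contained sketch of the same frame-progress pattern; the `FrameProgress` class and its names are illustrative, not part of this repo. Unlike the diff, it keeps the bar on a helper object so `update()` can find it on later decoding steps (in the committed version `pbar` is a plain local), and it uses `set_postfix_str`, which takes a plain string, where tqdm's `set_postfix` expects keyword arguments.

# Illustrative sketch, not part of this repo: one frame is assumed to be
# `num_image_patches` tokens, as in the model config above.
from tqdm import tqdm


class FrameProgress:
    """Token-level progress bar annotated with the current/total frame count."""

    def __init__(self, max_length: int, num_image_patches: int, prefix: str = ""):
        self.max_length = max_length
        self.num_image_patches = num_image_patches
        self.prefix = prefix
        self.pbar = None

    def step(self, seq_length: int) -> None:
        if self.pbar is None:
            # First decoding step: size the bar to the tokens still to generate.
            self.pbar = tqdm(total=self.max_length - seq_length,
                             desc=self.prefix, leave=False)
        else:
            self.pbar.update(1)
        n_frames = seq_length // self.num_image_patches
        total_frames = self.max_length // self.num_image_patches
        # set_postfix_str accepts a plain string; set_postfix expects kwargs.
        self.pbar.set_postfix_str(f"Frame [{n_frames + 1}/{total_frames}]")

    def close(self) -> None:
        if self.pbar is not None:
            self.pbar.close()

A hypothetical driver would call `progress.step(input_ids.size(1))` at the top of each `prepare_inputs_for_generation` call and `progress.close()` once `generate` returns.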