Update processing_videollama3.py
Browse files
processing_videollama3.py
CHANGED
@@ -682,14 +682,15 @@ class Videollama3Qwen2Processor(ProcessorMixin):
|
|
682 |
kwargs.pop("padding")
|
683 |
kwargs.pop("padding_side")
|
684 |
|
685 |
-
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
|
|
693 |
|
694 |
text_inputs = self.tokenizer(text, **kwargs)
|
695 |
return text_inputs
|
|
|
682 |
kwargs.pop("padding")
|
683 |
kwargs.pop("padding_side")
|
684 |
|
685 |
+
if len(grid_sizes) > 0:
|
686 |
+
image_idx = 0
|
687 |
+
while DEFAULT_IMAGE_TOKEN in text:
|
688 |
+
num_tokens = self._get_visual_seq_len(grid_sizes[image_idx])
|
689 |
+
text = text.replace(DEFAULT_IMAGE_TOKEN, "<placeholder>" * num_tokens, 1)
|
690 |
+
image_idx += 1
|
691 |
+
text = text.replace("<placeholder>", DEFAULT_IMAGE_TOKEN)
|
692 |
+
|
693 |
+
assert len(grid_sizes) == image_idx, "Number of images does not match the number of image tokens in the text."
|
694 |
|
695 |
text_inputs = self.tokenizer(text, **kwargs)
|
696 |
return text_inputs
|