alex-ht committed
Commit 894cde2
Parent(s): af366f6
code
Files changed:
- audio_processing_mllama.py (+66 -0)
- configuration_llama3.py (+112 -0)
- mllama_audio_model.py (+41 -0)
- modeling_llama3.py (+285 -0)
- preprocessor_config.json (+38 -0)
- processing_mllama.py (+377 -0)
audio_processing_mllama.py
ADDED
@@ -0,0 +1,66 @@
import math
from typing import Dict, List, Optional, Union
import numpy as np
import transformers
from transformers.tokenization_utils_base import AudioInput
from transformers.utils import TensorType
from transformers.feature_extraction_utils import BatchFeature
from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor, Wav2Vec2Config


def build_audio_tokens(text: List[str], audio_features: Union[Dict, List[List[np.ndarray]]], audio_token="<|audio|>") -> Dict:
    if not isinstance(audio_features, list):
        audio_features = audio_features['audio_features']
    bs = audio_features.shape[0]
    for i in range(bs):
        for j in range(len(audio_features[i])):
            tgt_token = f"<|audio_{j+1}|>" * get_num_embeddings(audio_features[i][j].shape[0])
            text[i] = text[i].replace(audio_token, tgt_token, 1)
    return text

def calculate_output_length(length_in, kernel_size, stride=1, padding=0, dilation=1):
    return (length_in + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1

def get_num_embeddings(wav_length: int, config: Wav2Vec2Config) -> int:
    curr_len = wav_length
    for i in range(config.num_feat_extract_layers):
        curr_len = calculate_output_length(curr_len, config.conv_kernel[i], stride=config.conv_stride[i])
    curr_len = calculate_output_length(curr_len, config.adapter_kernel_size, stride=config.adapter_stride)
    return curr_len + 2  # 2 = <|begin_of_audio|>, <|end_of_audio|>

class MllamaAudioFeatureExtractor(Wav2Vec2FeatureExtractor):

    def __call__(
        self,
        batch_audio_clips: List[List[AudioInput]],
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        audio_features = [[ super(MllamaAudioFeatureExtractor, self).__call__(audio_j, sampling_rate=16000, return_attention_mask=False)['input_features'][0] for audio_j in audio_i ] for audio_i in batch_audio_clips ]
        packed_audio_features = self.pack_audio_clips(audio_features)

        encoded_audio_inputs = BatchFeature(
            data={
                "audio_features": packed_audio_features,
            },
            tensor_type=return_tensors,
        )

        return encoded_audio_inputs

    def pack_audio_clips(self, batch_audio_clips: List[List[np.ndarray]]) -> np.ndarray:
        assert batch_audio_clips[0][0].ndim == 2  # sequence length x feature dimension
        # Determine output shape: (batch_size, max_num_clips, max_frames, feature_dim)
        batch_size = len(batch_audio_clips)
        max_num_clips = max([len(clips) for clips in batch_audio_clips])
        max_frames = max([clip.shape[0] for clips in batch_audio_clips for clip in clips])
        feature_dim = batch_audio_clips[0][0].shape[1]

        stacked_audio_clips = np.zeros((batch_size, max_num_clips, max_frames, feature_dim), dtype=np.float32)
        for i, clips in enumerate(batch_audio_clips):
            for j, clip in enumerate(clips):
                stacked_audio_clips[i, j, :clip.shape[0], :] = clip

        return stacked_audio_clips

AutoFeatureExtractor.register("MllamaAudioFeatureExtractor", MllamaAudioFeatureExtractor)
transformers.MllamaAudioFeatureExtractor = MllamaAudioFeatureExtractor
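A minimal usage sketch (my addition, not part of the commit): it assumes this file is importable as `audio_processing_mllama` and uses a default `Wav2Vec2Config` to see how many embedding positions a one-second, 16 kHz clip maps to via the convolution arithmetic above.

    # Hypothetical sketch: count embedding slots for a 1-second, 16 kHz waveform.
    # Assumes a default Wav2Vec2Config, whose conv_kernel/conv_stride/adapter_* attributes all exist.
    from transformers import Wav2Vec2Config
    from audio_processing_mllama import get_num_embeddings

    cfg = Wav2Vec2Config()
    num_positions = get_num_embeddings(16000, cfg)  # conv stack + adapter conv, plus the two boundary tokens
    print(num_positions)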
configuration_llama3.py
ADDED
@@ -0,0 +1,112 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mllama model configuration"""

import os
from typing import Dict, List, Optional, Union

import transformers
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging
from transformers import Wav2Vec2Config, AutoConfig
from transformers.models.mllama.configuration_mllama import MllamaVisionConfig, MllamaTextConfig

logger = logging.get_logger(__name__)


class Llama3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MllamaForConditionalGeneration`]. It is used to instantiate an
    Mllama model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Mllama-9B.

    e.g. [meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `MllamaVisionConfig`):
            The config object or dictionary of the vision backbone.
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `MllamaTextConfig`):
            The config object or dictionary of the text backbone.
        image_token_index (`int`, *optional*, defaults to 128256):
            The image token index to encode the image prompt.

    Example:

    ```python
    >>> from transformers import MllamaForConditionalGeneration, MllamaConfig, MllamaVisionConfig, MllamaTextConfig

    >>> # Initializing a CLIP-vision config
    >>> vision_config = MllamaVisionConfig()

    >>> # Initializing a Llama config
    >>> text_config = MllamaTextConfig()

    >>> # Initializing a mllama-11b style configuration
    >>> configuration = MllamaConfig(vision_config, text_config)

    >>> # Initializing a model from the mllama-11b style configuration
    >>> model = MllamaForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "llama3"
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        audio_config=None,
        image_token_index=128256,
        audio_token_index=128257,
        **kwargs,
    ):
        if vision_config is None:
            self.vision_config = MllamaVisionConfig()
            logger.info("vision_config is None, using default mllama vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = MllamaVisionConfig(**vision_config)
        elif isinstance(vision_config, MllamaVisionConfig):
            self.vision_config = vision_config

        self.image_token_index = image_token_index

        if audio_config is None:
            self.audio_config = Wav2Vec2Config()
            logger.info("audio_config is None, using default mllama audio config")
        elif isinstance(audio_config, dict):
            self.audio_config = Wav2Vec2Config(**audio_config)
        elif isinstance(audio_config, Wav2Vec2Config):
            self.audio_config = audio_config

        self.audio_token_index = audio_token_index

        if text_config is None:
            self.text_config = MllamaTextConfig()
            logger.info("text_config is None, using default mllama text config")
        elif isinstance(text_config, dict):
            self.text_config = MllamaTextConfig(**text_config)
        elif isinstance(text_config, MllamaTextConfig):
            self.text_config = text_config

        super().__init__(**kwargs)

AutoConfig.register("llama3", Llama3Config)
transformers.Llama3Config = Llama3Config
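A hedged construction sketch (my addition, not part of the commit): each sub-config can be passed as a nested dict, mirroring how `from_pretrained` would supply them; the sizes below are placeholders chosen only for illustration.

    from configuration_llama3 import Llama3Config

    config = Llama3Config(
        vision_config=None,                                        # falls back to MllamaVisionConfig()
        text_config={"vocab_size": 128264, "hidden_size": 4096},
        audio_config={"output_hidden_size": 4096},                 # must equal text hidden_size for Llama3Embedding
    )
    print(config.model_type, config.image_token_index, config.audio_token_index)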
mllama_audio_model.py
ADDED
@@ -0,0 +1,41 @@
from typing import Optional, Tuple, Union
import torch
from torch import nn
from transformers.modeling_outputs import BaseModelOutput
from transformers import Wav2Vec2Model, Wav2Vec2Config, MllamaPreTrainedModel
from transformers.models.wav2vec2_bert.modeling_wav2vec2_bert import Wav2Vec2BertAdapterLayer
from configuration_llama3 import Llama3Config


class Llama3Embedding(MllamaPreTrainedModel):
    config_class = Llama3Config
    base_model_prefix = "audio_model"
    def __init__(self, config: Llama3Config):
        super().__init__(config)
        assert config.audio_config.output_hidden_size == config.text_config.hidden_size
        self.text_embeddings = nn.Embedding(config.text_config.vocab_size, config.text_config.hidden_size, config.text_config.pad_token_id)
        config.audio_config.add_adapter = False
        self.audio_model = Wav2Vec2Model(config.audio_config)
        self.start_of_audio = nn.Parameter(data=torch.zeros((1, config.audio_config.output_hidden_size)), requires_grad=True)
        self.end_of_audio = nn.Parameter(data=torch.zeros((1, config.audio_config.output_hidden_size)), requires_grad=True)
        self.text_config = config.text_config

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        audio_features: Optional[torch.Tensor] = None,
    ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]:
        input_embeddings = self.text_embeddings(input_ids.clamp_min(0).detach())
        if audio_features is None:
            return input_embeddings
        bs, max_num_img, l, d = audio_features.shape
        audio_embeddings = self.audio_model(input_features=audio_features.view((bs*max_num_img, l, d)))['last_hidden_state']
        audio_embeddings = audio_embeddings.view((bs, max_num_img, -1, self.start_of_audio.shape[-1]))

        for i in range(bs):
            for j in range(max_num_img):
                audio_id = -1 - j
                if torch.any(input_ids[i] == audio_id):
                    positions = torch.nonzero(input_ids[i] == audio_id, as_tuple=True)
                    input_embeddings[i] = input_embeddings[i].index_put(positions, torch.concat([self.start_of_audio, audio_embeddings[i, j, :, :], self.end_of_audio]), accumulate=False)
        return input_embeddings
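A toy illustration (my addition, not from the commit) of the placeholder convention the loop above relies on: the j-th audio clip of a sample is marked in `input_ids` by the negative id `-(j + 1)`, and those rows of the text embeddings are overwritten with `[start_of_audio, clip embeddings, end_of_audio]`.

    import torch

    hidden = 4
    # one sample: clip 0 occupies ids == -1 (1 frame + 2 boundary slots), clip 1 occupies ids == -2
    input_ids = torch.tensor([128000, -1, -1, -1, 15, -2, -2, -2])
    embeddings = torch.zeros(input_ids.shape[0], hidden)

    start = torch.full((1, hidden), 2.0)  # stand-in for start_of_audio
    end = torch.full((1, hidden), 3.0)    # stand-in for end_of_audio
    clip = torch.ones(1, hidden)          # stand-in for one frame of audio_model output
    for j in range(2):
        positions = torch.nonzero(input_ids == -1 - j, as_tuple=True)
        embeddings = embeddings.index_put(positions, torch.cat([start, clip, end]), accumulate=False)
    print(embeddings)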
modeling_llama3.py
ADDED
@@ -0,0 +1,285 @@
# coding=utf-8
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

import transformers
from transformers import MllamaPreTrainedModel, MllamaVisionModel, MllamaForCausalLM, AutoModel
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import logging
from transformers.models.mllama.modeling_mllama import _prepare_cross_attention_mask
from configuration_llama3 import Llama3Config
from mllama_audio_model import Llama3Embedding


logger = logging.get_logger(__name__)


class Llama3ForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
    config_class = Llama3Config
    base_model_prefix = "model"
    _supports_quantized_cache = False  # quant cache not supported in encoder-decoder setting

    def __init__(self, config: Llama3Config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.hidden_size = config.text_config.hidden_size
        self.max_num_tiles = config.vision_config.max_num_tiles
        self.vision_output_dim = config.vision_config.vision_output_dim
        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1

        self.vision_model = MllamaVisionModel._from_config(config.vision_config)
        self.language_model = MllamaForCausalLM._from_config(config.text_config)
        self.embed_tokens = Llama3Embedding(config)
        self.multi_modal_projector = nn.Linear(
            config.vision_config.vision_output_dim,
            config.text_config.hidden_size,
            bias=True,
        )
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens.text_embeddings

    def set_input_embeddings(self, value):
        self.embed_tokens.text_embeddings = value

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def tie_weights(self):
        return self.language_model.tie_weights()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        audio_features: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        aspect_ratio_mask: Optional[torch.Tensor] = None,
        aspect_ratio_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_mask: Optional[torch.Tensor] = None,
        cross_attention_states: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        num_logits_to_keep: int = 0,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            num_logits_to_keep (`int`, *optional*):
                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.


        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaForConditionalGeneration

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaForConditionalGeneration.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> prompt = "<|image|>If I had to write a haiku for this one"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> # Generate
        >>> output = model.generate(**inputs, max_new_tokens=15)

        >>> prompt_len = inputs.input_ids.shape[-1]
        >>> generated_ids = output[:, prompt_len:]
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        >>> print(generated_text)
        [', it would be:.\\nA stop sign in Chinatown.\\n']
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.text_config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.text_config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.text_config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
            )

        if pixel_values is not None and cross_attention_states is not None:
            raise ValueError("`pixel_values` and `cross_attention_states` cannot be provided simultaneously")

        if pixel_values is not None:
            if aspect_ratio_ids is None:
                raise ValueError("`aspect_ratio_ids` must be provided if `pixel_values` is provided")
            # get vision tokens from vision model
            vision_outputs = self.vision_model(
                pixel_values=pixel_values,
                aspect_ratio_ids=aspect_ratio_ids,
                aspect_ratio_mask=aspect_ratio_mask,
                output_hidden_states=output_hidden_states,
                output_attentions=output_attentions,
                return_dict=return_dict,
            )
            cross_attention_states = vision_outputs[0]
            cross_attention_states = self.multi_modal_projector(cross_attention_states).reshape(
                -1, cross_attention_states.shape[-2], self.hidden_size
            )

        if cross_attention_mask is not None:
            cross_attention_mask, full_text_row_masked_out_mask = _prepare_cross_attention_mask(
                cross_attention_mask,
                num_vision_tokens=self.vision_model.num_patches,
                dtype=self.dtype,
            )
        else:
            full_text_row_masked_out_mask = None

        if cross_attention_mask is not None and cache_position is not None:
            cross_attention_mask = cross_attention_mask[:, :, cache_position]
            full_text_row_masked_out_mask = full_text_row_masked_out_mask[:, :, cache_position]

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids=input_ids, audio_features=audio_features)

        outputs = self.language_model(
            input_ids=None,
            attention_mask=attention_mask,
            position_ids=position_ids,
            cross_attention_states=cross_attention_states,
            cross_attention_mask=cross_attention_mask,
            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            inputs_embeds=inputs_embeds,
            labels=labels,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
            cache_position=cache_position,
            num_logits_to_keep=num_logits_to_keep,
        )

        return outputs

    def prepare_inputs_for_generation(
        self,
        input_ids=None,
        audio_features=None,
        inputs_embeds=None,
        attention_mask=None,
        position_ids=None,
        pixel_values=None,
        aspect_ratio_ids=None,
        aspect_ratio_mask=None,
        cross_attention_mask=None,
        past_key_values=None,
        use_cache=False,
        cache_position=None,
        num_logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        # TODO: we have no attention_mask so this won't work, check if we really won't need attention mask and find another way
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

                # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture.
                position_ids = position_ids.clone(memory_format=torch.contiguous_format)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
        else:
            # The clone here is for the same reason as for `position_ids`.
            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

        if num_logits_to_keep is not None:
            model_inputs["num_logits_to_keep"] = num_logits_to_keep

        model_inputs.update(
            {
                "audio_features": audio_features,
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "cross_attention_mask": cross_attention_mask,
            }
        )

        # If we're in pre-fill or cacheless decoding step, then we need pixel_values and aspect ratios
        # to compute image hidden states, otherwise they are cached within each cross attn layer
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values
            model_inputs["aspect_ratio_ids"] = aspect_ratio_ids
            model_inputs["aspect_ratio_mask"] = aspect_ratio_mask

        return model_inputs

    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
        cross_attention_mask_prev = model_kwargs.get("cross_attention_mask", None)
        model_kwargs = super()._update_model_kwargs_for_generation(
            outputs=outputs,
            model_kwargs=model_kwargs,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )

        # add cross-attn mask for new token
        if cross_attention_mask_prev is not None:
            model_kwargs["cross_attention_mask"] = torch.cat(
                [cross_attention_mask_prev, cross_attention_mask_prev[:, -1:, ...]], dim=1
            )
        return model_kwargs

AutoModel.register(Llama3Config, Llama3ForConditionalGeneration)
transformers.Llama3ForConditionalGeneration = Llama3ForConditionalGeneration
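A small sanity check (my addition) of the cross-attention mask growth performed in `_update_model_kwargs_for_generation`: the mask row of the last prompt position is repeated for each newly generated token.

    import torch

    cross_attention_mask = torch.ones(1, 5, 1, 4)   # (batch, seq_len, num_images, num_tiles)
    cross_attention_mask = torch.cat(
        [cross_attention_mask, cross_attention_mask[:, -1:, ...]], dim=1
    )                                               # after one decoding step
    print(cross_attention_mask.shape)               # torch.Size([1, 6, 1, 4])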
preprocessor_config.json
ADDED
@@ -0,0 +1,38 @@
{
  "auto_map": {
    "AutoFeatureExtractor": "audio_processing_mllama.MllamaAudioFeatureExtractor",
    "AutoProcessor": "processing_mllama.MllamaProcessor"
  },
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "MllamaAudioFeatureExtractor",
  "feature_size": 80,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "MllamaImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "max_image_tiles": 4,
  "num_mel_bins": 80,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "MllamaProcessor",
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "return_attention_mask": true,
  "sampling_rate": 16000,
  "size": {
    "height": 560,
    "width": 560
  },
  "stride": 2
}
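The `auto_map` entries above are what let `AutoFeatureExtractor` and `AutoProcessor` resolve the custom classes shipped in this repository. A hedged loading sketch (my addition; `<repo_id>` is a placeholder for this model repository):

    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("<repo_id>", trust_remote_code=True)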
processing_mllama.py
ADDED
@@ -0,0 +1,377 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Processor class for Mllama."""

from typing import List, Optional, Union

import numpy as np
import torch
import transformers
from transformers import AutoProcessor
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, AudioKwargs
from transformers.tokenization_utils_base import (
    PreTokenizedInput,
    TextInput,
    AudioInput,
)

# TODO: Can we do it that way or its better include as "Copied from ..."
from transformers.models.mllama.image_processing_mllama import make_list_of_images
from .audio_processing_mllama import build_audio_tokens


class MllamaImagesKwargs(ImagesKwargs, total=False):
    max_image_tiles: Optional[int]

class MllamaProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: MllamaImagesKwargs

    _defaults = {
        "image_kwargs": {
            "max_image_tiles": 4,
        },
    }


def get_cross_attention_token_mask(input_ids: List[int], image_token_id: int) -> List[List[int]]:
    """
    Generate a cross-attention token mask for image tokens in the input sequence.

    This function identifies the positions of image tokens in the input sequence and creates
    a mask that defines which subsequent tokens each image token should attend to.

    Args:
        input_ids (List[int]): A list of token ids representing the input sequence.
        image_token_id (int): The id of the token used to represent images in the sequence.

    Returns:
        List[List[int]]: A list of [start, end] pairs, where each pair represents the range
        of tokens an image token should attend to.

    Notes:
        - If no image tokens are present, an empty list is returned.
        - For a single image token, it attends to all subsequent tokens until the end of the sequence.
        - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
        - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
    """

    image_token_locations = [i for i, token in enumerate(input_ids) if token == image_token_id]

    if len(image_token_locations) == 0:
        return []

    # only one image present, unmask until end of sequence
    if len(image_token_locations) == 1:
        return [[image_token_locations[0], -1]]

    vision_masks = [[loc1, loc2] for loc1, loc2 in zip(image_token_locations[:-1], image_token_locations[1:])]

    # last image will attend to all subsequent text
    vision_masks.append([image_token_locations[-1], len(input_ids)])

    # if there are two or more consecutive vision tokens,
    # they should all attend to all subsequent
    # text present
    last_mask_end = vision_masks[-1][1]
    for vision_mask in vision_masks[::-1]:
        if vision_mask[0] == vision_mask[1] - 1:
            vision_mask[1] = last_mask_end
        last_mask_end = vision_mask[1]

    return vision_masks


def convert_sparse_cross_attention_mask_to_dense(
    cross_attention_token_mask: List[List[List[int]]],
    num_tiles: List[List[int]],
    max_num_tiles: int,
    length: int,
) -> np.ndarray:
    """
    Convert the cross attention mask indices to a cross attention mask 4D array.

    This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
    The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

    Args:
        cross_attention_token_mask (List[List[List[int]]]): A nested list structure where:
            - The outer list represents the batch dimension.
            - The middle list represents different images within each batch item.
            - The inner list contains pairs of integers [start, end] representing token ranges for each image.
        num_tiles (List[List[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
        max_num_tiles (int): The maximum possible number of tiles.
        length (int): The total sequence length of the input.

    Returns:
        np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
            The array contains `1` where attention is allowed and `0` where it is not.

    Note:
        - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
    """

    batch_size = len(cross_attention_token_mask)
    max_num_images = max([len(masks) for masks in cross_attention_token_mask])

    cross_attention_mask = np.zeros(
        shape=(batch_size, length, max_num_images, max_num_tiles),
        dtype=np.int64,
    )

    for sample_idx, (sample_masks, sample_num_tiles) in enumerate(zip(cross_attention_token_mask, num_tiles)):
        for mask_idx, (locations, mask_num_tiles) in enumerate(zip(sample_masks, sample_num_tiles)):
            if len(locations) == 2:
                start, end = locations
                end = min(end, length)
                if end == -1:
                    end = length
                cross_attention_mask[sample_idx, start:end, mask_idx, :mask_num_tiles] = 1
    return cross_attention_mask


def build_string_from_input(prompt: str, bos_token: str, image_token: str) -> str:
    """
    Builds a string from the input prompt by adding `bos_token` if not already present.

    Args:
        prompt (`str`):
            The input prompt string.
        bos_token (`str`):
            The beginning of sentence token to be added.
        image_token (`str`):
            The image token used to identify the start of an image sequence.

    Returns:
        str: The modified prompt string with the `bos_token` added if necessary.

    Examples:
        >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'

        >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><begin_of_text>Hello world'

        >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'
    """

    if bos_token in prompt:
        return prompt

    num_image_tokens_on_start = 0
    while prompt.startswith(image_token):
        prompt = prompt[len(image_token) :]
        num_image_tokens_on_start += 1

    return f"{image_token * num_image_tokens_on_start}{bos_token}{prompt}"


class MllamaProcessor(ProcessorMixin):
    r"""
    Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and
    [`PretrainedTokenizerFast`] into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
    ```python
    from transformers import MllamaProcessor
    from PIL import Image

    processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")

    processor(
        images=your_pil_image,
        text=["<|image|>If I had to write a haiku for this one"],
        images_kwargs = {"size": {"height": 448, "width": 448}},
        text_kwargs = {"padding": "right"},
        common_kwargs = {"return_tensors": "pt"},
    )
    ```

    Args:
        image_processor ([`MllamaImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.

    """

    attributes = ["image_processor", "audio_processor", "tokenizer"]
    image_processor_class = "MllamaImageProcessor"
    audio_processor_class = "MllamaAudioFeatureExtractor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, image_processor, audio_processor, tokenizer):
        self.image_token = "<|image|>"
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        self.audio_token = "<|audio|>"
        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
        self.python_token = "<|python_tag|>"
        self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token)
        self.bos_token = tokenizer.bos_token
        self.chat_template = tokenizer.chat_template
        super().__init__(image_processor, audio_processor, tokenizer)
        self.tokenizer.add_tokens([f"<|audio_{i}|>" for i in range(1, 50)])


    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        audio: Optional[Union[AudioInput, List[AudioInput]]] = None,
        videos=None,
        **kwargs: Unpack[MllamaProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` arguments to
        MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
        to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **audio_features** -- Audio features extracted using SeamlessM4TFeatureExtractor. Returned when `audio` is not `None`.
            TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
        """
        if text is None:
            raise ValueError("You must specify text.")

        output_kwargs = self._merge_kwargs(
            MllamaProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        text_kwargs = output_kwargs["text_kwargs"]
        images_kwargs = output_kwargs["images_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        data = {}

        if audio is not None:
            audio_features = self.audio_processor(audio)
            data.update(audio_features)

        if isinstance(text, str):
            text = [text]
        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
        n_images_in_text = [t.count(self.image_token) for t in text]
        text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
        _ = text_kwargs.pop("padding_side", None)  # hack until padding-side is an accepted kwarg by tokenizers

        if audio is not None:
            text = build_audio_tokens(text, audio_features, self.audio_token)

        encoding = self.tokenizer(text, add_special_tokens=False, **text_kwargs)
        if audio is not None:
            beg_audio_id = self.tokenizer.convert_tokens_to_ids("<|audio_1|>")
            idx = torch.where(encoding['input_ids'] >= beg_audio_id)
            encoding['input_ids'][idx] = beg_audio_id - encoding['input_ids'][idx] - 1
        data.update(encoding)

        n_images_in_images = [0]
        if images is not None:
            images = make_list_of_images(images)
            n_images_in_images = [len(sample) for sample in images]

        if text is not None:
            if any(batch_img == 0 for batch_img in n_images_in_text) and not all(
                batch_img == 0 for batch_img in n_images_in_text
            ):
                raise ValueError(
                    "If a batch of text is provided, there should be either no images or at least one image per sample"
                )
            if sum(n_images_in_images) != sum(n_images_in_text):
                if images is None:
                    raise ValueError("No image were provided, but there are image tokens in the prompt")
                else:
                    raise ValueError(
                        f"The number of image token ({sum(n_images_in_text)}) should be the same as in the number of provided images ({sum(n_images_in_images)})"
                    )

        if images is not None:
            image_features = self.image_processor(images, **images_kwargs)
            num_tiles = image_features.pop("num_tiles")
            data.update(image_features)

        # Create cross attention mask
        if images is not None and text is not None:
            cross_attention_token_mask = [
                get_cross_attention_token_mask(token_ids, self.image_token_id) for token_ids in encoding["input_ids"]
            ]
            cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
                cross_attention_token_mask,
                num_tiles=num_tiles,
                max_num_tiles=self.image_processor.max_image_tiles,
                length=max(len(input_ids) for input_ids in encoding["input_ids"]),
            )
            data["cross_attention_mask"] = cross_attention_mask

        return_tensors = common_kwargs.pop("return_tensors", None)
        batch_feature = BatchFeature(data=data, tensor_type=return_tensors)

        return batch_feature

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        audio_processor_input_names = self.audio_processor.model_input_names
        return list(tokenizer_input_names +
                    image_processor_input_names +
                    ["cross_attention_mask"] +
                    audio_processor_input_names)

AutoProcessor.register("MllamaProcessor", MllamaProcessor)
transformers.MllamaProcessor = MllamaProcessor
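A hedged end-to-end usage sketch (my addition, not part of the commit): preparing a mixed text/image/audio batch. `<repo_id>` is a placeholder and `your_pil_image` stands in for a real PIL image, mirroring the docstring above; audio clips are raw 16 kHz waveforms, one list of clips per sample.

    import numpy as np
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("<repo_id>", trust_remote_code=True)
    inputs = processor(
        text=["<|image|><|audio|>Describe what you see and hear."],
        images=[[your_pil_image]],                    # one image per sample
        audio=[[np.zeros(16000, dtype=np.float32)]],  # one 1-second clip per sample
        return_tensors="pt",
    )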