Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

.gitattributes +1 -0
config.json +8 -0
mexma_siglip.py +86 -0
model.safetensors +3 -0
preprocessor_config.json +24 -0
special_tokens_map.json +15 -0
tokenizer.json +3 -0
tokenizer_config.json +54 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "architectures": [
+    "MexmaSigLIP"
+  ],
+  "optimized": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.3"
+}

mexma_siglip.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import (
+    PretrainedConfig,
+    PreTrainedModel,
+    SiglipVisionConfig,
+    SiglipVisionModel,
+    XLMRobertaConfig,
+    XLMRobertaModel,
+)
+class MexmaSigLIPConfig(PretrainedConfig):
+    def __init__(
+        self,
+        optimized: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.optimized = optimized
+class MexmaSigLIP(PreTrainedModel):
+    config_class = MexmaSigLIPConfig
+    def __init__(self, config: MexmaSigLIPConfig):
+        super().__init__(config)
+        self.config = config
+        text_config = XLMRobertaConfig.from_pretrained("facebook/MEXMA")
+        if self.config.optimized:
+            text_config._attn_implementation = "sdpa"
+        self.text_model = XLMRobertaModel(text_config, add_pooling_layer=False)
+        self.text_projector = torch.nn.Linear(1024, 1152, bias=False)
+        vision_congig = SiglipVisionConfig.from_pretrained(
+            "google/siglip-so400m-patch14-384"
+        )
+        if self.config.optimized:
+            vision_congig._attn_implementation = "flash_attention_2"
+        self.vision_model = SiglipVisionModel(vision_congig).vision_model
+        self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.logit_bias = torch.nn.Parameter(torch.ones([]) * -10)
+    def forward(self, image_inputs, input_ids, attention_mask, normalize=False):
+        text_features = self.encode_texts(input_ids, attention_mask, normalize)
+        image_features = self.encode_images(image_inputs, normalize)
+        return {
+            "image_features": image_features,
+            "text_features": text_features,
+            "logit_scale": self.logit_scale,
+            "logit_bias": self.logit_bias,
+        }
+    def encode_images(
+        self,
+        pixel_values,
+        normalize=False,
+    ):
+        features = self.vision_model(pixel_values).pooler_output
+        return F.normalize(features, dim=-1) if normalize else features
+    def encode_texts(
+        self,
+        input_ids,
+        attention_mask,
+        normalize=False,
+    ):
+        features = self.text_model(
+            input_ids=input_ids, attention_mask=attention_mask
+        ).last_hidden_state[:, 0]
+        features = self.text_projector(features)
+        return F.normalize(features, dim=-1) if normalize else features
+    def get_logits(
+        self,
+        input_ids,
+        attention_mask,
+        pixel_values,
+    ):
+        image_features = self.encode_images(pixel_values, normalize=True)
+        text_features = self.encode_texts(input_ids, attention_mask, normalize=True)
+        image_logits = (
+            self.logit_scale.exp() * image_features @ text_features.T + self.logit_bias
+        )
+        text_logits = image_logits.T
+        return image_logits, text_logits

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:419bd95e6ccccc504a98cdf5c8887d4feb4afe1dcdc785a3ebfab44bac7c644e
+size 3953089552

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "SiglipProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 384,
+    "width": 384
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
+size 17082734

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}