kimsan0622 committed
Commit: b2024af
Parent(s): a9888f7

Upload model

Browse files:
- config.json +179 -0
- configuration_veld.py +129 -0
- modeling_veld.py +0 -0
- pytorch_model.bin +3 -0
config.json
ADDED
@@ -0,0 +1,179 @@
{
  "_commit_hash": null,
  "_name_or_path": "checkpoints/veld_e1_linear",
  "architectures": [
    "VELDModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_veld.VELDConfig",
    "AutoModel": "modeling_veld.VELDModel"
  },
  "decoder": {
    "_name_or_path": "KETI-AIR/ke-t5-base",
    "add_cross_attention": true,
    "architectures": [
      "T5DualDecoderDoubleHeadsModel"
    ],
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "d_ff": 2048,
    "d_kv": 64,
    "d_model": 768,
    "decoder_start_token_id": 0,
    "dense_act_fn": "gelu_new",
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout_rate": 0.1,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 1,
    "exponential_decay_length_penalty": null,
    "feed_forward_proj": "gated-gelu",
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_factor": 1.0,
    "is_decoder": true,
    "is_encoder_decoder": false,
    "is_gated_act": true,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_epsilon": 1e-06,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "t5",
    "n_positions": 512,
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_decoder_layers": 12,
    "num_heads": 12,
    "num_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 0,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "relative_attention_max_distance": 128,
    "relative_attention_num_buckets": 32,
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.22.1",
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": true,
    "vocab_size": 64128
  },
  "encoder": {
    "_name_or_path": "google/vit-base-patch16-384",
    "add_cross_attention": false,
    "architectures": [
      "ViTForImageClassification"
    ],
    "attention_probs_dropout_prob": 0.0,
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "encoder_stride": 16,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 384,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-12,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "vit",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_channels": 3,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 16,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "qkv_bias": true,
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.22.1",
    "typical_p": 1.0,
    "use_bfloat16": false
  },
  "eos_token_id": 1,
  "is_encoder_decoder": true,
  "model_type": "veld",
  "num_queries_global": 1,
  "num_queries_local": 256,
  "pad_token_id": 0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": null
}
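Note: because config.json declares an "auto_map", the model can be loaded through the transformers Auto classes with trust_remote_code enabled, which imports the Hub-side configuration_veld.py and modeling_veld.py. A minimal sketch follows; the repository id is a placeholder (the actual Hub path is not shown on this page), so substitute the real repo or a local checkout:

from transformers import AutoConfig, AutoModel

# Hypothetical repo id; replace with the actual Hub path or a local directory.
repo_id = "kimsan0622/veld"

# trust_remote_code=True allows the custom VELDConfig / VELDModel classes
# referenced in auto_map to be downloaded and used.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)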
configuration_veld.py
ADDED
@@ -0,0 +1,129 @@
# coding=utf-8
# Copyright 2022, The T5 Authors and HuggingFace Inc, san kim.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" vision-encoder-language-decoder-t5 model configuration"""
import copy

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers.models.auto.configuration_auto import AutoConfig
from transformers import T5Config, ViTConfig


logger = logging.get_logger(__name__)

class VELDConfig(PretrainedConfig):
    r"""
    [`VELDConfig`] is the configuration class to store the configuration of a
    [`VELDModel`]. It is used to instantiate a Vision-Encoder-Text-Decoder model according to the
    specified arguments, defining the encoder and decoder configs.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        kwargs (*optional*):
            Dictionary of keyword arguments. Notably:

                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
                  the encoder config.
                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
                  the decoder config.

    Examples:

    ```python
    >>> from transformers import T5Config, ViTConfig
    >>> from configuration_veld import VELDConfig
    >>> from modeling_veld import VELDModel

    >>> # Initializing ViT- and T5-style configurations
    >>> config_encoder = ViTConfig()
    >>> config_decoder = T5Config()

    >>> config = VELDConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

    >>> # Initializing a VELD model from ViT- and T5-style configurations
    >>> model = VELDModel(config=config)

    >>> # Accessing the model configuration
    >>> config_encoder = model.config.encoder
    >>> config_decoder = model.config.decoder
    >>> # set decoder config to causal lm
    >>> config_decoder.is_decoder = True
    >>> config_decoder.add_cross_attention = True

    >>> # Saving the model, including its configuration
    >>> model.save_pretrained("my-model")

    >>> # loading model and config from pretrained folder
    >>> encoder_decoder_config = VELDConfig.from_pretrained("my-model")
    >>> model = VELDModel.from_pretrained("my-model", config=encoder_decoder_config)
    ```"""
    model_type = "veld"
    is_composition = True

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if "encoder" not in kwargs or "decoder" not in kwargs:
            raise ValueError(
                f"A configuration of type {self.model_type} cannot be instantiated because "
                f"both `encoder` and `decoder` sub-configurations were not passed; received only {kwargs}"
            )

        encoder_config = kwargs.pop("encoder")
        encoder_model_type = encoder_config.pop("model_type")
        decoder_config = kwargs.pop("decoder")
        decoder_model_type = decoder_config.pop("model_type")

        self.encoder = ViTConfig(**encoder_config)
        self.decoder = T5Config(**decoder_config)
        self.is_encoder_decoder = True

        self.pad_token_id = self.decoder.pad_token_id
        self.eos_token_id = self.decoder.eos_token_id

        # `kwargs` is a plain dict, so `getattr(kwargs, ...)` would always fall
        # back to the default; `dict.get` reads the values actually passed in.
        self.num_queries_global = kwargs.get("num_queries_global", 1)
        self.num_queries_local = kwargs.get("num_queries_local", 256)


    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: T5Config, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`VELDConfig`] (or a derived class) from a pre-trained encoder model
        configuration and decoder model configuration.

        Returns:
            [`VELDConfig`]: An instance of a configuration object
        """
        logger.info("Setting `config.is_decoder=True` and `config.is_encoder_decoder=False` for decoder_config")
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False

        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Overrides the default *to_dict()* from *PretrainedConfig*.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["encoder"] = self.encoder.to_dict()
        output["decoder"] = self.decoder.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
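For reference, a minimal sketch of building this configuration from the two upstream checkpoints recorded in config.json (Hub access is assumed; the num_queries_* values mirror the uploaded config):

from transformers import T5Config, ViTConfig
from configuration_veld import VELDConfig

# Encoder/decoder checkpoints as recorded in config.json.
encoder_config = ViTConfig.from_pretrained("google/vit-base-patch16-384")
decoder_config = T5Config.from_pretrained("KETI-AIR/ke-t5-base")

config = VELDConfig.from_encoder_decoder_configs(
    encoder_config,
    decoder_config,
    num_queries_global=1,   # matches config.json
    num_queries_local=256,  # matches config.json
)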
modeling_veld.py
ADDED
The diff for this file is too large to render.
See raw diff
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c0f321d19a471b793b277694b2adf577c807c7b35f087ea2b89669b74feb5467
size 1354141353
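This is a Git LFS pointer file: the roughly 1.35 GB weight file itself is stored out of band and addressed by its SHA-256 (the oid) and byte size. A minimal sketch for verifying a downloaded copy against the pointer; the local path is an assumption:

import hashlib
import os

# Hypothetical local path to the downloaded weights.
path = "pytorch_model.bin"

# Stream the file in 1 MiB chunks so the large blob is never held in memory.
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert os.path.getsize(path) == 1354141353, "size mismatch with LFS pointer"
assert h.hexdigest() == "c0f321d19a471b793b277694b2adf577c807c7b35f087ea2b89669b74feb5467", "oid mismatch"
print("pytorch_model.bin matches the LFS pointer")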