kimsan0622 committed
Commit
b2024af
1 Parent(s): a9888f7

Upload model

Files changed (4)
  1. config.json +179 -0
  2. configuration_veld.py +129 -0
  3. modeling_veld.py +0 -0
  4. pytorch_model.bin +3 -0
config.json ADDED
@@ -0,0 +1,179 @@
+ {
+   "_commit_hash": null,
+   "_name_or_path": "checkpoints/veld_e1_linear",
+   "architectures": [
+     "VELDModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_veld.VELDConfig",
+     "AutoModel": "modeling_veld.VELDModel"
+   },
+   "decoder": {
+     "_name_or_path": "KETI-AIR/ke-t5-base",
+     "add_cross_attention": true,
+     "architectures": [
+       "T5DualDecoderDoubleHeadsModel"
+     ],
+     "bad_words_ids": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "d_ff": 2048,
+     "d_kv": 64,
+     "d_model": 768,
+     "decoder_start_token_id": 0,
+     "dense_act_fn": "gelu_new",
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout_rate": 0.1,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 1,
+     "exponential_decay_length_penalty": null,
+     "feed_forward_proj": "gated-gelu",
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_factor": 1.0,
+     "is_decoder": true,
+     "is_encoder_decoder": false,
+     "is_gated_act": true,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_epsilon": 1e-06,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "t5",
+     "n_positions": 512,
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_decoder_layers": 12,
+     "num_heads": 12,
+     "num_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 0,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "relative_attention_max_distance": 128,
+     "relative_attention_num_buckets": 32,
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.22.1",
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 64128
+   },
+   "encoder": {
+     "_name_or_path": "google/vit-base-patch16-384",
+     "add_cross_attention": false,
+     "architectures": [
+       "ViTForImageClassification"
+     ],
+     "attention_probs_dropout_prob": 0.0,
+     "bad_words_ids": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "encoder_stride": 16,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.0,
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 384,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-12,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "vit",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 16,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "qkv_bias": true,
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "transformers_version": "4.22.1",
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "eos_token_id": 1,
+   "is_encoder_decoder": true,
+   "model_type": "veld",
+   "num_queries_global": 1,
+   "num_queries_local": 256,
+   "pad_token_id": 0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": null
+ }
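
Since the `auto_map` block above points `AutoConfig` and `AutoModel` at the bundled `configuration_veld.py` and `modeling_veld.py`, the checkpoint should be loadable with `trust_remote_code=True`. A minimal loading sketch; the repo id below is a placeholder assumption, not something this diff confirms:

```python
from transformers import AutoConfig, AutoModel

repo_id = "kimsan0622/veld"  # hypothetical repo id, for illustration only

# trust_remote_code=True lets transformers import VELDConfig / VELDModel
# from the Python files uploaded in this commit.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
```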
configuration_veld.py ADDED
@@ -0,0 +1,129 @@
+ # coding=utf-8
+ # Copyright 2022 The T5 Authors, HuggingFace Inc., and San Kim.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """VELD (vision-encoder, T5 language-decoder) model configuration"""
+ import copy
+
+ from transformers import T5Config, ViTConfig
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class VELDConfig(PretrainedConfig):
+     r"""
+     [`VELDConfig`] is the configuration class to store the configuration of a [`VELDModel`]. It is used to
+     instantiate a vision-encoder text-decoder model according to the specified arguments, defining the
+     encoder and decoder configs.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read
+     the documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         kwargs (*optional*):
+             Dictionary of keyword arguments. Notably:
+
+             - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
+               defines the encoder config.
+             - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
+               defines the decoder config.
+
+     Examples:
+
+     ```python
+     >>> from transformers import T5Config, ViTConfig
+     >>> from configuration_veld import VELDConfig
+     >>> from modeling_veld import VELDModel
+
+     >>> # Initializing ViT & T5 style configurations
+     >>> config_encoder = ViTConfig()
+     >>> config_decoder = T5Config()
+
+     >>> config = VELDConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+
+     >>> # Initializing a VELDModel from the ViT & T5 style configurations
+     >>> model = VELDModel(config=config)
+
+     >>> # Accessing the model configuration
+     >>> config_encoder = model.config.encoder
+     >>> config_decoder = model.config.decoder
+     >>> # set decoder config to causal lm
+     >>> config_decoder.is_decoder = True
+     >>> config_decoder.add_cross_attention = True
+
+     >>> # Saving the model, including its configuration
+     >>> model.save_pretrained("my-model")
+
+     >>> # loading model and config from pretrained folder
+     >>> encoder_decoder_config = VELDConfig.from_pretrained("my-model")
+     >>> model = VELDModel.from_pretrained("my-model", config=encoder_decoder_config)
+     ```"""
+
+     model_type = "veld"
+     is_composition = True
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         if "encoder" not in kwargs or "decoder" not in kwargs:
+             raise ValueError(
+                 f"A configuration of type {self.model_type} cannot be instantiated because both `encoder` and "
+                 f"`decoder` sub-configurations must be passed; received only {kwargs}"
+             )
+
+         # `model_type` is popped so it is not passed on to the sub-config constructors.
+         encoder_config = kwargs.pop("encoder")
+         encoder_config.pop("model_type", None)
+         decoder_config = kwargs.pop("decoder")
+         decoder_config.pop("model_type", None)
+
+         self.encoder = ViTConfig(**encoder_config)
+         self.decoder = T5Config(**decoder_config)
+         self.is_encoder_decoder = True
+
+         self.pad_token_id = self.decoder.pad_token_id
+         self.eos_token_id = self.decoder.eos_token_id
+
+         # `kwargs` is a plain dict, so the optional query counts are read with `dict.get`.
+         self.num_queries_global = kwargs.get("num_queries_global", 1)
+         self.num_queries_local = kwargs.get("num_queries_local", 256)
+
+     @classmethod
+     def from_encoder_decoder_configs(
+         cls, encoder_config: PretrainedConfig, decoder_config: T5Config, **kwargs
+     ) -> PretrainedConfig:
+         r"""
+         Instantiate a [`VELDConfig`] (or a derived class) from a pre-trained encoder model configuration and a
+         decoder model configuration.
+
+         Returns:
+             [`VELDConfig`]: An instance of a configuration object
+         """
+         logger.info("Setting `config.is_decoder=True` and `config.is_encoder_decoder=False` for decoder_config")
+         decoder_config.is_decoder = True
+         decoder_config.is_encoder_decoder = False
+
+         return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
+
+     def to_dict(self):
+         """
+         Serializes this instance to a Python dictionary. Overrides the default *to_dict()* from
+         *PretrainedConfig*.
+
+         Returns:
+             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+         """
+         output = copy.deepcopy(self.__dict__)
+         output["encoder"] = self.encoder.to_dict()
+         output["decoder"] = self.decoder.to_dict()
+         output["model_type"] = self.__class__.model_type
+         return output
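
A short usage sketch of the class above, mirroring the docstring example and exercising the two query-count kwargs that `__init__` defines; the directory name is illustrative:

```python
from transformers import T5Config, ViTConfig
from configuration_veld import VELDConfig

# Build a composed VELD config from default-style sub-configs;
# num_queries_* are the two extra knobs this config adds.
config = VELDConfig.from_encoder_decoder_configs(
    ViTConfig(), T5Config(), num_queries_global=1, num_queries_local=256
)

config.save_pretrained("veld-config")  # writes config.json via the to_dict() override
reloaded = VELDConfig.from_pretrained("veld-config")
assert reloaded.decoder.is_decoder  # set by from_encoder_decoder_configs
```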
modeling_veld.py ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0f321d19a471b793b277694b2adf577c807c7b35f087ea2b89669b74feb5467
+ size 1354141353
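
The three lines above are only a Git LFS pointer; the actual ~1.35 GB weight file lives in LFS storage, and `huggingface_hub` resolves the pointer transparently when downloading. A sketch, again with a placeholder repo id:

```python
from huggingface_hub import hf_hub_download

# Fetches the resolved binary (not the 3-line pointer) into the local cache.
path = hf_hub_download(
    repo_id="kimsan0622/veld",  # hypothetical repo id
    filename="pytorch_model.bin",
)
print(path)
```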