ErfanMoosaviMonazzah committed
Commit 71bb029
1 Parent(s): 0664fac

Upload model

config.json ADDED
@@ -0,0 +1,48 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "BackpackGPT2NLIModel"
+   ],
+   "attn_pdrop": 0.1,
+   "auto_map": {
+     "AutoConfig": "configuration_backpack_gpt2_nli.BackpackGPT2NLIConfig",
+     "AutoModelForSequenceClassification": "modeling_backpack_gpt2_nli.BackpackGPT2NLIModel"
+   },
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "freeze_backpack": true,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "initializer_range": 0.02,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 512,
+   "num_senses": 16,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": true,
+   "scale_attn_weights": true,
+   "sense_intermediate_scale": 4,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.31.0",
+   "use_cache": true,
+   "vocab_size": 50264
+ }
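The `auto_map` entries above expose the custom config and model classes to the Auto API. A minimal loading sketch, assuming the repository id below (a placeholder, not confirmed by this commit) and that a GPT-2 tokenizer is uploaded alongside the model:

```python
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

repo_id = "ErfanMoosaviMonazzah/backpack-gpt2-nli"  # hypothetical repo id; substitute the actual Hub path

# trust_remote_code is needed because auto_map points at the custom
# configuration_backpack_gpt2_nli / modeling_backpack_gpt2_nli modules in this repository.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)  # assumes a GPT-2 tokenizer is part of the upload
```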
configuration_backpack_gpt2.py ADDED
@@ -0,0 +1,42 @@
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+ 
+ class BackpackGPT2Config(GPT2Config):
+     """
+     This is the configuration class to store the configuration of a [`BackpackGPT2Model`]. It is used to
+     instantiate a Backpack GPT-2 model according to the specified arguments, defining the model architecture.
+ 
+     Configuration objects inherit from [`GPT2Config`] and can be used to control the model outputs. Read the
+     documentation from [`GPT2Config`] for more information.
+ 
+     Args:
+         num_senses (`int`, *optional*, defaults to 16):
+             The number of sense vectors to define for each word.
+         sense_intermediate_scale (`int`, *optional*, defaults to 4):
+             The hidden dimensionality of the sense vector network.
+ 
+     Example:
+ 
+     ```python
+     >>> from transformers import BackpackGPT2Config, BackpackGPT2Model
+ 
+     >>> # Initializing a GPT2 configuration
+     >>> configuration = BackpackGPT2Config()
+ 
+     >>> # Initializing a model (with random weights) from the configuration
+     >>> model = BackpackGPT2Model(configuration)
+ 
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```
+     """
+ 
+     def __init__(self,
+                  vocab_size=50264,
+                  num_senses=16,
+                  sense_intermediate_scale=4,
+                  n_positions=512,
+                  scale_attn_by_inverse_layer_idx=True,
+                  **kwargs,
+                  ):
+         self.num_senses = num_senses
+         self.sense_intermediate_scale = sense_intermediate_scale
+         super().__init__(vocab_size=vocab_size, n_positions=n_positions, scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, **kwargs)
configuration_backpack_gpt2_nli.py ADDED
@@ -0,0 +1,19 @@
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+ 
+ class BackpackGPT2NLIConfig(GPT2Config):
+     def __init__(self,
+                  freeze_backpack=True,
+                  num_labels=3,
+                  vocab_size=50264,
+                  num_senses=16,
+                  sense_intermediate_scale=4,
+                  n_positions=512,
+                  scale_attn_by_inverse_layer_idx=True,
+                  **kwargs,
+                  ):
+         self.freeze_backpack = freeze_backpack
+         #self.num_labels = num_labels
+ 
+         self.num_senses = num_senses
+         self.sense_intermediate_scale = sense_intermediate_scale
+         super().__init__(vocab_size=vocab_size, n_positions=n_positions, scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, num_labels=num_labels, **kwargs)
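A short sketch of constructing this config directly, assuming the repository files have been downloaded and are importable as a local module; `freeze_backpack=False` is shown only to illustrate leaving the encoder trainable:

```python
# Assumes configuration_backpack_gpt2_nli.py is on the Python path (hypothetical local setup).
from configuration_backpack_gpt2_nli import BackpackGPT2NLIConfig

config = BackpackGPT2NLIConfig()                          # defaults: frozen Backpack encoder, 3 labels
config_ft = BackpackGPT2NLIConfig(freeze_backpack=False)  # variant that fine-tunes the encoder as well
print(config.num_senses, config.sense_intermediate_scale, config.num_labels)  # 16 4 3
```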
modeling_backpack_gpt2.py ADDED
@@ -0,0 +1,229 @@
+ import math
+ from dataclasses import dataclass
+ from typing import Optional, Tuple
+ 
+ import torch
+ import torch.utils.checkpoint
+ from torch import nn
+ 
+ from transformers.activations import ACT2FN
+ from transformers.pytorch_utils import Conv1D
+ from transformers.utils import (
+     ModelOutput,
+     logging,
+ )
+ from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2PreTrainedModel
+ from .configuration_backpack_gpt2 import BackpackGPT2Config  # From the same directory
+ 
+ logger = logging.get_logger(__name__)
+ 
+ 
+ ### Backpack-Specific
+ class BackpackGPT2PreTrainedModel(GPT2PreTrainedModel):
+     """
+     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+     models.
+     """
+     _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias"]
+ 
+     config_class = BackpackGPT2Config
+     base_model_prefix = "backpack"
+     is_parallelizable = True
+     supports_gradient_checkpointing = False
+     _no_split_modules = ["GPT2Block", "BackpackNoMixBlock"]
+ 
+     def __init__(self, *inputs, **kwargs):
+         super().__init__(*inputs, **kwargs)
+ 
+ class BackpackMLP(nn.Module):
+ 
+     def __init__(self, embed_dim, intermediate_dim, out_dim, config):
+         super().__init__()
+         self.c_fc = Conv1D(intermediate_dim, embed_dim)
+         self.c_proj = Conv1D(out_dim, intermediate_dim)
+         self.act = ACT2FN[config.activation_function]
+         self.dropout = nn.Dropout(config.resid_pdrop)
+ 
+     def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+         hidden_states = self.c_fc(hidden_states)  # run order: 5
+         hidden_states = self.act(hidden_states)
+         hidden_states = self.c_proj(hidden_states)
+         hidden_states = self.dropout(hidden_states)
+         return hidden_states
+ 
+ class BackpackNoMixBlock(nn.Module):
+ 
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+         self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+         self.mlp = BackpackMLP(config.n_embd, config.n_embd*4, config.n_embd, config)
+         self.resid_dropout1 = nn.Dropout(config.resid_pdrop)
+         self.resid_dropout2 = nn.Dropout(config.resid_pdrop)
+ 
+     def forward(self, hidden_states, residual):
+         residual = self.resid_dropout1(hidden_states) + residual  # run order: 4
+         hidden_states = self.ln_1(residual)
+         mlp_out = self.mlp(hidden_states)
+         residual = self.resid_dropout2(mlp_out) + residual
+         hidden_states = self.ln_2(residual)
+         return hidden_states
+ 
+ class BackpackSenseNetwork(nn.Module):
+     def __init__(self, config, num_senses, device=None, dtype=None):
+         super().__init__()
+         self.num_senses = num_senses  # paper: k
+         #self.embeddings = embeddings
+         self.n_embd = config.n_embd  # paper: d
+ 
+         self.dropout = nn.Dropout(config.embd_pdrop)
+         self.block = BackpackNoMixBlock(config)
+         self.ln = nn.LayerNorm(self.n_embd, eps=config.layer_norm_epsilon)
+         self.final_mlp = BackpackMLP(
+             embed_dim=config.n_embd,
+             intermediate_dim=config.sense_intermediate_scale*config.n_embd,
+             out_dim=config.n_embd*config.num_senses,
+             config=config,
+         )
+ 
+     def forward(self, input_embeds):
+         residual = self.dropout(input_embeds)  # run order: 3
+         hidden_states = self.ln(residual)
+         hidden_states = self.block(hidden_states, residual)
+         senses = self.final_mlp(hidden_states)
+         bs, s, nvd = senses.shape
+         return senses.reshape(bs, s, self.num_senses, self.n_embd).transpose(1,2)  # (bs, nv, s, d)
+ 
+ class BackpackWeightNetwork(nn.Module):
+ 
+     def __init__(self, num_senses, embed_dim):
+         super().__init__()
+         self.n_embd = embed_dim
+         self.num_senses = num_senses
+         self.embed_per_sense = embed_dim // num_senses
+         self.c_attn = nn.Linear(embed_dim, 2 * num_senses * self.embed_per_sense)
+         self.softmax_scale = None
+ 
+     def forward(self, encoded):
+         b, s, d = encoded.shape
+         encoded = self.c_attn(encoded)  # (b, s, 2*d)
+         encoded = encoded.reshape(b, s, 2, self.num_senses, self.embed_per_sense)  # (b, s, 2, nv, d//nv)
+         batch_size, seqlen = encoded.shape[0], encoded.shape[1]
+ 
+         # compute scores & mask
+         q, k = encoded.unbind(dim=2)
+         softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
+         scores = torch.einsum('bthd,bshd->bhts', q, k * softmax_scale)
+         causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
+         scores = scores + causal_mask.to(dtype=scores.dtype)
+ 
+         return torch.softmax(scores, dim=-1, dtype=q.dtype)
+ 
+ 
+ @dataclass
+ class BackpackGPT2BaseModelOutput(ModelOutput):
+     hidden_states: torch.FloatTensor = None
+     contextualization: torch.FloatTensor = None
+ 
+ class BackpackGPT2Model(BackpackGPT2PreTrainedModel):
+     _keys_to_ignore_on_load_missing = [r".*attn.masked_bias", r".*attn.bias"]
+ 
+     def __init__(self, config):
+         super().__init__(config)
+ 
+         self.embed_dim = config.n_embd  # paper: d
+         self.num_senses = config.num_senses  # paper: k
+ 
+         self.gpt2_model = GPT2Model(config)  # paper: parameterizing contextualization weights: Transformer Model
+ 
+         self.sense_network = BackpackSenseNetwork(config, self.num_senses, self.gpt2_model.wte)  # paper: parametrizing senses
+         self.word_embeddings = self.gpt2_model.wte
+         self.position_embeddings = self.gpt2_model.wpe
+         self.sense_weight_net = BackpackWeightNetwork(self.num_senses, self.embed_dim)  # paper: parameterizing contextualization weights
+         # Model parallel
+         self.model_parallel = False
+         self.device_map = None
+         self.gradient_checkpointing = False
+ 
+     def get_num_senses(self):
+         return self.num_senses
+ 
+     def get_word_embeddings(self):
+         return self.word_embeddings
+ 
+     def get_sense_network(self):
+         return self.sense_network
+ 
+     def forward(self, input_ids, position_ids):
+         # Compute senses
+         sense_input_embeds = self.word_embeddings(input_ids)  # GPT2 word emb without pos  # run order: 2
+         senses = self.sense_network(sense_input_embeds)  # (bs, nv, s, d) = (batch size, num senses, seq len, embed dim)
+ 
+         # Compute contextualization weights
+         contextl_hidden_states = self.gpt2_model(input_ids, position_ids=position_ids).last_hidden_state  # (bs, s, d)
+         contextualization = self.sense_weight_net(contextl_hidden_states)  # (bs, nv, s, s)
+ 
+         # Compute resulting outputs
+         hidden_states = torch.sum(contextualization @ senses, dim=1)  # (bs, nv, s, d) -> (bs, s, d)
+         return BackpackGPT2BaseModelOutput(
+             hidden_states=hidden_states,  # paper: o
+             contextualization=contextualization,
+         )
+ 
+     def run_with_custom_contextualization(self, input_ids, contextualization):
+         # Compute senses
+         sense_input_embeds = self.word_embeddings(input_ids)
+         senses = self.sense_network(sense_input_embeds)  # (bs, nv, s, d)
+ 
+         # Compute resulting outputs
+         hidden_states = torch.sum(contextualization @ senses, dim=1)  # (bs, nv, s, d) -> (bs, s, d)
+         return BackpackGPT2BaseModelOutput(
+             hidden_states=hidden_states,
+             contextualization=contextualization,
+         )
+ 
+ @dataclass
+ class BackpackGPT2LMHeadModelOutput(ModelOutput):
+     logits: torch.FloatTensor = None
+     contextualization: torch.FloatTensor = None
+ 
+ class BackpackGPT2LMHeadModel(BackpackGPT2PreTrainedModel):
+     _keys_to_ignore_on_load_missing = [r".*attn.masked_bias", r".*attn.bias"]
+ 
+     def __init__(self, config):
+         super().__init__(config)
+         self.backpack = BackpackGPT2Model(config)
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+ 
+         # Model parallel
+         self.model_parallel = False
+         self.device_map = None
+ 
+         self.tie_weights()
+ 
+     def tie_weights(self):
+         self.lm_head.weight = self.backpack.word_embeddings.weight  # also tied with the underlying transformer's word embeddings
+ 
+     def get_lm_head(self):
+         return self.lm_head
+ 
+     def forward(self, input_ids, position_ids=None):  # run order: 1
+         outputs = self.backpack(input_ids, position_ids=position_ids)
+         hidden_states, contextualization = outputs.hidden_states, outputs.contextualization
+         lm_logits = self.lm_head(hidden_states)  # (bs, s, V)
+         return BackpackGPT2LMHeadModelOutput(
+             logits=lm_logits,
+             contextualization=contextualization,
+         )
+         # Note: hidden_states holds the contextualized embeddings of the input tokens. To build a
+         # classification head, write a class similar to this one whose head (e.g. a clf_head) projects
+         # to the number of labels instead of the vocabulary size.
+ 
+     def run_with_custom_contextualization(self, input_ids, contextualization):
+         outputs = self.backpack.run_with_custom_contextualization(input_ids, contextualization)
+         hidden_states, contextualization = outputs.hidden_states, outputs.contextualization
+         lm_logits = self.lm_head(hidden_states)
+         return BackpackGPT2LMHeadModelOutput(
+             logits=lm_logits,
+             contextualization=contextualization,
+         )
+ 
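The combination step in `BackpackGPT2Model.forward` is just a batched matrix product followed by a sum over senses. A shape-only sketch with dummy tensors (no trained weights) to illustrate how `(bs, nv, s, s) @ (bs, nv, s, d)` collapses to `(bs, s, d)`:

```python
import torch

bs, nv, s, d = 2, 16, 8, 768                 # batch, senses, sequence length, embed dim
senses = torch.randn(bs, nv, s, d)           # stand-in for the sense network output
contextualization = torch.softmax(torch.randn(bs, nv, s, s), dim=-1)  # stand-in for the weight network output

# Each output position is a weighted mix of sense vectors across positions, summed over senses.
hidden_states = torch.sum(contextualization @ senses, dim=1)
print(hidden_states.shape)                   # torch.Size([2, 8, 768])
```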
modeling_backpack_gpt2_nli.py ADDED
@@ -0,0 +1,55 @@
+ import torch
+ import torch.utils.checkpoint
+ from torch import nn
+ 
+ from transformers.models.gpt2.modeling_gpt2 import GPT2PreTrainedModel
+ from .configuration_backpack_gpt2_nli import BackpackGPT2NLIConfig
+ from .modeling_backpack_gpt2 import BackpackGPT2Model
+ 
+ 
+ class BackpackGPT2NLIModel(GPT2PreTrainedModel):
+     config_class = BackpackGPT2NLIConfig
+ 
+     def __init__(self, config):
+         super().__init__(config)
+ 
+         self.backpack = BackpackGPT2Model(config)
+ 
+         self.n_embd = config.n_embd
+ 
+         self.num_labels = config.num_labels  # 0: Entailment -- 1: Neutral -- 2: Contradiction
+ 
+         self.nli_head = nn.Sequential(
+             nn.Linear(self.n_embd, self.n_embd),
+             nn.Dropout(0.1),
+             nn.Linear(self.n_embd, self.num_labels)
+         )
+ 
+         # Freeze the encoder if needed
+         self.backpack.requires_grad_(not config.freeze_backpack)
+ 
+         self.loss_func = nn.CrossEntropyLoss()
+ 
+         # Model parallel
+         self.model_parallel = False
+ 
+     def forward(self, input_ids=None, attention_mask=None, labels=None):
+ 
+         backpack_outputs = self.backpack(input_ids=input_ids, position_ids=None)
+ 
+         backpack_hidden_states, backpack_contextualization = backpack_outputs.hidden_states, backpack_outputs.contextualization
+         last_toks_indices = attention_mask.shape[1] - 1 - attention_mask.flip((1,)).argmax(dim=1)  # index of the last non-padding token (according to the attention mask)
+         last_backpack_hidden_states = backpack_hidden_states[torch.arange(backpack_hidden_states.shape[0]), last_toks_indices, :]
+ 
+         logits = self.nli_head(last_backpack_hidden_states)
+ 
+         if labels is not None:
+             # Cross-entropy over the (batch, num_labels) logits and the flattened labels
+             flat_logits = logits
+             flat_labels = labels.view(-1)
+ 
+             loss = self.loss_func(flat_logits, flat_labels)
+             return {'logits': logits, 'loss': loss}
+         else:
+             return {'logits': logits}
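A hedged inference sketch for the NLI head. The repository id and the way premise and hypothesis are joined into a single sequence are assumptions (the upload does not include the preprocessing used for fine-tuning); the label names follow the comment in `BackpackGPT2NLIModel.__init__`:

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo_id = "ErfanMoosaviMonazzah/backpack-gpt2-nli"  # hypothetical repo id, as above

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id, trust_remote_code=True)
model.eval()

# Assumed formatting: premise followed by hypothesis in one sequence.
inputs = tokenizer("A man is playing a guitar. A man is making music.", return_tensors="pt")
with torch.no_grad():
    out = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

# Label indices per the comment in BackpackGPT2NLIModel.__init__
labels = {0: "entailment", 1: "neutral", 2: "contradiction"}
print(labels[out["logits"].argmax(dim=-1).item()])
```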
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7bc9bbb9567385a0e5d334d1c2391ac80b77becda3be7e947aa9d50edc607306
+ size 682759533