tensorgirl committed
Commit · f203678
1 Parent(s): 8522232
Upload AugViTForImageClassification
Browse files
- README.md +46 -0
- augvit_config.py +31 -0
- augvit_model.py +178 -0
- config.json +20 -0
- tf_model.h5 +3 -0
README.md
ADDED
@@ -0,0 +1,46 @@
---
tags:
- generated_from_keras_callback
model-index:
- name: TFaugvit
  results: []
---

<!-- This model card has been generated automatically according to the information Keras had access to. You should
probably proofread and complete it, then remove this comment. -->

# TFaugvit

This model is a fine-tuned version of [](https://huggingface.co/) on an unknown dataset.
It achieves the following results on the evaluation set:


## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- optimizer: None
- training_precision: float32

### Training results


### Framework versions

- Transformers 4.33.2
- TensorFlow 2.13.0
- Tokenizers 0.13.3
augvit_config.py
ADDED
@@ -0,0 +1,31 @@
from transformers import PretrainedConfig


class AugViTConfig(PretrainedConfig):
    model_type = "augvit"

    def __init__(
        self,
        image_size: int = 32,
        patch_size: int = 4,
        num_classes: int = 10,
        dim: int = 128,
        depth: int = 6,
        heads: int = 16,
        mlp_dim: int = 256,
        dropout: float = 0.1,
        emb_dropout: float = 0.1,
        **kwargs,
    ):
        # Mirrors the constructor arguments of the AUGViT model in augvit_model.py.
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_classes = num_classes
        self.dim = dim
        self.depth = depth
        self.heads = heads
        self.mlp_dim = mlp_dim
        self.dropout = dropout
        self.emb_dropout = emb_dropout
        super().__init__(**kwargs)
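A minimal usage sketch for the config class above (the module name augvit_config and the output directory are illustrative assumptions, not part of this commit):

from augvit_config import AugViTConfig

# Instantiate with the defaults above and serialize; save_pretrained writes a
# config.json like the one added later in this commit.
config = AugViTConfig(image_size=32, patch_size=4, num_classes=10)
print(config.model_type)  # "augvit"
config.save_pretrained("./augvit-checkpoint")  # hypothetical output directory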
augvit_model.py
ADDED
@@ -0,0 +1,178 @@
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras import Sequential
import tensorflow.keras.layers as nn

from tensorflow import einsum
from einops import rearrange, repeat
from einops.layers.tensorflow import Rearrange
import numpy as np

from transformers import TFPreTrainedModel
from .augvit_config import AugViTConfig


def pair(t):
    return t if isinstance(t, tuple) else (t, t)


def gelu(x):
    # Tanh approximation of the GELU activation.
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf


class PreNorm(Layer):
    def __init__(self, fn, name):
        super(PreNorm, self).__init__(name=name)
        self.norm = nn.LayerNormalization(name=f'{name}/layernorm')
        self.fn = fn

    def call(self, x, training=True):
        return self.fn(self.norm(x), training=training)


class MLP(Layer):
    def __init__(self, dim, hidden_dim, name, dropout=0.0):
        super(MLP, self).__init__(name=name)
        self.net = Sequential([
            nn.Dense(units=hidden_dim, activation=gelu, name=f'{name}/den1'),
            nn.Dropout(rate=dropout, name=f'{name}/drop1'),
            nn.Dense(units=dim, name=f'{name}/den2'),
            nn.Dropout(rate=dropout, name=f'{name}/drop2')
        ], name=f'{name}/seq1')

    def call(self, x, training=True):
        return self.net(x, training=training)


class Attention(Layer):
    def __init__(self, dim, name, heads=8, dim_head=64, dropout=0.0):
        super(Attention, self).__init__(name=name)
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(name=f'{name}/soft')
        self.to_qkv = nn.Dense(units=inner_dim * 3, use_bias=False, name=f'{name}/den1')

        if project_out:
            self.to_out = [
                nn.Dense(units=dim, name=f'{name}/den2'),
                nn.Dropout(rate=dropout, name=f'{name}/drop1')
            ]
        else:
            self.to_out = []
        self.to_out = Sequential(self.to_out, name=f'{name}/seq')

    def call(self, x, training=True):
        # One dense projection produces queries, keys and values; split and
        # reshape into per-head tensors.
        qkv = self.to_qkv(x)
        qkv = tf.split(qkv, num_or_size_splits=3, axis=-1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=self.heads), qkv)

        # dots = tf.matmul(q, tf.transpose(k, perm=[0, 1, 3, 2])) * self.scale
        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
        attn = self.attend(dots)

        # x = tf.matmul(attn, v)
        x = einsum('b h i j, b h j d -> b h i d', attn, v)
        x = rearrange(x, 'b h n d -> b n (h d)')
        x = self.to_out(x, training=training)

        return x


class Transformer(Layer):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, name, dropout=0.0):
        super(Transformer, self).__init__(name=name)

        self.layers = []

        for i in range(depth):
            self.layers.append([
                PreNorm(Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout, name=f'{name}/att{i}'), name=f'{name}preno{i}'),
                PreNorm(nn.Dense(dim, activation=gelu, name=f'{name}/den{i}'), name=f'{name}preno1{i}'),
                PreNorm(MLP(dim, mlp_dim, dropout=dropout, name=f'{name}/mlp{i}'), name=f'{name}preno2{i}'),
                PreNorm(nn.Dense(dim, activation=gelu, name=f'{name}/den2{i}'), name=f'{name}preno3{i}'),
            ])

    def call(self, x, training=True):
        # Each block combines the usual residual connection with an augmented
        # shortcut: a learned dense path added alongside attention and the MLP.
        for attn, aug_attn, mlp, augs in self.layers:
            x = attn(x, training=training) + x + aug_attn(x, training=training)
            x = mlp(x, training=training) + x + augs(x, training=training)
        return x


class AUGViT(Model):
    def __init__(self, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, name='augvit',
                 pool='cls', dim_head=64, dropout=0.0, emb_dropout=0.0):

        super(AUGViT, self).__init__(name=name)

        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # Flatten each patch and project it to the embedding dimension.
        self.patch_embedding = Sequential([
            Rearrange('b (h p1) (w p2) c -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.Dense(units=dim, name='patchden')
        ], name='patch_embedding')

        self.pos_embedding = tf.Variable(initial_value=tf.random.normal([1, num_patches + 1, dim]), name='pos_emb')
        self.cls_token = tf.Variable(initial_value=tf.random.normal([1, 1, dim]), name='cls')
        self.dropout = nn.Dropout(rate=emb_dropout, name='drop')

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout=dropout, name='trans')

        self.pool = pool

        self.mlp_head = Sequential([
            nn.LayerNormalization(name='layernorm'),
            nn.Dense(units=num_classes, name='dense12')
        ], name='mlp_head')

    def call(self, img, training=True, **kwargs):
        x = self.patch_embedding(img)
        b, n, d = x.shape

        # Prepend a learnable class token and add positional embeddings.
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b)
        x = tf.concat([cls_tokens, x], axis=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x, training=training)

        x = self.transformer(x, training=training)

        # Pool either the class token or the mean over all patch tokens.
        if self.pool == 'mean':
            x = tf.reduce_mean(x, axis=1)
        else:
            x = x[:, 0]

        x = self.mlp_head(x)

        return x


class AugViTForImageClassification(TFPreTrainedModel):
    config_class = AugViTConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = AUGViT(
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_classes=config.num_classes,
            dim=config.dim,
            depth=config.depth,
            heads=config.heads,
            mlp_dim=config.mlp_dim,
            dropout=config.dropout,
            emb_dropout=config.emb_dropout
        )

    def call(self, inputs, **kwargs):
        logits = self.model(inputs)
        return logits
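A minimal forward-pass sketch for the bare AUGViT model above, using the defaults from augvit_config.py and random images (illustrative only; real inputs would come from a dataset):

import tensorflow as tf

# Defaults from AugViTConfig: 32x32 images in 4x4 patches -> 64 patch tokens.
vit = AUGViT(image_size=32, patch_size=4, num_classes=10,
             dim=128, depth=6, heads=16, mlp_dim=256)

images = tf.random.normal([2, 32, 32, 3])  # channels-last, as the Rearrange pattern expects
logits = vit(images, training=False)
print(logits.shape)  # (2, 10)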
config.json
ADDED
@@ -0,0 +1,20 @@
{
  "architectures": [
    "AugViTForImageClassification"
  ],
  "auto_map": {
    "AutoConfig": "augvit_config.AugViTConfig",
    "TFAutoModelForImageClassification": "augvit_model.AugViTForImageClassification"
  },
  "depth": 1,
  "dim": 128,
  "dropout": 0.1,
  "emb_dropout": 0.1,
  "heads": 16,
  "image_size": 32,
  "mlp_dim": 256,
  "model_type": "augvit",
  "num_classes": 10,
  "patch_size": 4,
  "transformers_version": "4.33.2"
}
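The auto_map block above is what lets transformers import these custom classes at load time. A hedged loading sketch (the repo id is a placeholder, not taken from this commit):

from transformers import AutoConfig, TFAutoModelForImageClassification

# trust_remote_code fetches augvit_config.py / augvit_model.py from the repo
# and resolves the classes through the auto_map entries in config.json.
config = AutoConfig.from_pretrained("<user>/<augvit-repo>", trust_remote_code=True)
model = TFAutoModelForImageClassification.from_pretrained(
    "<user>/<augvit-repo>", trust_remote_code=True
)
# With image_size=32 and patch_size=4 the encoder sees (32 // 4) ** 2 = 64
# patch tokens, plus one cls token.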
tf_model.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d4253d3d1d254a42edd6baf604ba35d0317f6754cec27fe47739ba60908c235
size 2613128