Patsorn committed
Commit 1858f7e · 1 Parent(s): d91fc09

update code

code/clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
+size 1356917
code/clip/clip.py ADDED
@@ -0,0 +1,232 @@
+# Code ported from https://github.com/openai/CLIP
+
+import hashlib
+import os
+import urllib
+import warnings
+from typing import Union, List
+
+import torch
+from PIL import Image
+from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, RandomResizedCrop, RandomAffine
+from tqdm import tqdm
+
+from clip.model import build_model
+from clip.tokenizer import SimpleTokenizer as _Tokenizer
+
+__all__ = ["available_models", "load", "tokenize"]
+_tokenizer = _Tokenizer()
+
+_MODELS = {
+    "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
+    "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
+    "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
+    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
+    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
+}
+
+
+def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
+    os.makedirs(root, exist_ok=True)
+    filename = os.path.basename(url)
+
+    expected_sha256 = url.split("/")[-2]
+    download_target = os.path.join(root, filename)
+
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(f"{download_target} exists and is not a regular file")
+
+    if os.path.isfile(download_target):
+        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
+            return download_target
+        else:
+            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
+
+    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
+        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+
+                output.write(buffer)
+                loop.update(len(buffer))
+
+    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
+        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
+
+    return download_target
+
+
+def _convert_to_rgb(image):
+    return image.convert('RGB')
+
+
+def _transform(n_px: int, is_train: bool, affine: bool = False):
+    normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+    if is_train:
+        if affine:
+            return Compose([
+                RandomAffine(degrees=30, translate=(0.3, 0.3), shear=[-30, 30, -30, 30], scale=(1, 2), fill=255, interpolation=Image.BICUBIC),
+                RandomResizedCrop(n_px, scale=(0.8, 1.0), interpolation=Image.BICUBIC),
+                _convert_to_rgb,
+                ToTensor(),
+                normalize,
+            ])
+        else:
+            return Compose([
+                RandomResizedCrop(n_px, scale=(0.9, 1.0), interpolation=Image.BICUBIC),
+                _convert_to_rgb,
+                ToTensor(),
+                normalize,
+            ])
+    else:
+        return Compose([
+            Resize(n_px, interpolation=Image.BICUBIC),
+            CenterCrop(n_px),
+            _convert_to_rgb,
+            ToTensor(),
+            normalize,
+        ])
+
+
+
+
+def available_models() -> List[str]:
+    """Returns the names of available CLIP models"""
+    return list(_MODELS.keys())
+
+
+def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True, is_train=False, pretrained=True, weight_sharing=False, feature_fusion='avg', affine_transformation=False, num_class=None):
+    """Load a CLIP model
+    Parameters
+    ----------
+    name : str
+        A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
+    device : Union[str, torch.device]
+        The device to put the loaded model on
+    jit : bool
+        Whether to load the optimized JIT model (default) or the more hackable non-JIT model.
+    Returns
+    -------
+    model : torch.nn.Module
+        The CLIP model
+    preprocess : Callable[[PIL.Image], torch.Tensor]
+        A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
+    """
+    assert num_class is not None
+    if name in _MODELS:
+        model_path = _download(_MODELS[name])
+    elif os.path.isfile(name):
+        model_path = name
+    else:
+        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
+
+    try:
+        # loading JIT archive
+        model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
+        state_dict = None
+    except RuntimeError:
+        # loading saved state dict
+        if jit:
+            warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
+            jit = False
+        state_dict = torch.load(model_path, map_location="cpu")
+
+    if not jit:
+        try:
+            model = build_model(state_dict or model.state_dict(), weight_sharing, feature_fusion, num_class=num_class).to(device)
+        except KeyError:
+            sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}  # strip the 7-character "module." prefix from wrapped checkpoints
+            model = build_model(sd, weight_sharing, feature_fusion, num_class=num_class).to(device)
+
+        if str(device) == "cpu":
+            model.float()
+        return model, \
+            _transform(model.visual.input_resolution, is_train=True, affine=affine_transformation), \
+            _transform(model.visual.input_resolution, is_train=False)
+    # sanity check to make sure we are not loading up an old version of the networks directly
+    assert model.visual2 is not None
+    # patch the device names
+    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
+    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
+
+    def patch_device(module):
+        graphs = [module.graph] if hasattr(module, "graph") else []
+        if hasattr(module, "forward1"):
+            graphs.append(module.forward1.graph)
+
+        for graph in graphs:
+            for node in graph.findAllNodes("prim::Constant"):
+                if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
+                    node.copyAttributes(device_node)
+
+    # load sketch branch
+    weight_sharing = model.weight_sharing
+    if weight_sharing:
+        model.visual2 = model.visual
+    else:
+        # copy weights from the image branch
+        sd1 = model.visual.state_dict()
+        sd2 = model.visual2.state_dict()
+        for name, param in sd1.items():
+            assert name in sd2
+            sd2[name].copy_(param)
+
+    model.apply(patch_device)
+    patch_device(model.encode_image)
+    patch_device(model.encode_text)
+
+    # patch dtype to float32 on CPU
+    if str(device) == "cpu":
+        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
+        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
+        float_node = float_input.node()
+
+        def patch_float(module):
+            graphs = [module.graph] if hasattr(module, "graph") else []
+            if hasattr(module, "forward1"):
+                graphs.append(module.forward1.graph)
+
+            for graph in graphs:
+                for node in graph.findAllNodes("aten::to"):
+                    inputs = list(node.inputs())
+                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
+                        if inputs[i].node()["value"] == 5:
+                            inputs[i].node().copyAttributes(float_node)
+
+        model.apply(patch_float)
+        patch_float(model.encode_image)
+        patch_float(model.encode_text)
+
+        model.float()
+
+    return model, \
+        _transform(model.input_resolution.item(), is_train=True, affine=affine_transformation), \
+        _transform(model.input_resolution.item(), is_train=False)
+
+
+def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
+    """
+    Returns the tokenized representation of given input string(s)
+    Parameters
+    ----------
+    texts : Union[str, List[str]]
+        An input string or a list of input strings to tokenize
+    context_length : int
+        The context length to use; all CLIP models use 77 as the context length
+    Returns
+    -------
+    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
+    """
+    if isinstance(texts, str):
+        texts = [texts]
+
+    sot_token = _tokenizer.encoder["<start_of_text>"]
+    eot_token = _tokenizer.encoder["<end_of_text>"]
+    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
+    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+
+    for i, tokens in enumerate(all_tokens):
+        if len(tokens) > context_length:  # truncate
+            tokens = tokens[:context_length]
+        result[i, :len(tokens)] = torch.tensor(tokens)
+
+    return result
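
A minimal usage sketch of the loader above, for reference only (not part of the commit). It assumes the code/ directory is on PYTHONPATH so the package imports as clip, uses a hypothetical photo.jpg input, and passes num_class=90 only because this fork's load() asserts a value is given:

    import torch
    from PIL import Image
    from clip.clip import load, tokenize

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # "ViT-B/16" is fetched into ~/.cache/clip on first use; num_class=90 mirrors CLIP.__init__'s default
    model, train_preprocess, val_preprocess = load("ViT-B/16", device=device, jit=False, num_class=90)

    image = val_preprocess(Image.open("photo.jpg")).unsqueeze(0).to(device)  # hypothetical input file
    text = tokenize(["a photo of a cat"]).to(device)                         # shape [1, 77]
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)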
code/clip/model.py ADDED
@@ -0,0 +1,506 @@
+from collections import OrderedDict
+from typing import Tuple, Union
+
+import os
+import json
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1):
+        super().__init__()
+
+        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
+        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
+
+        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = None
+        self.stride = stride
+
+        if stride > 1 or inplanes != planes * Bottleneck.expansion:
+            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
+            self.downsample = nn.Sequential(OrderedDict([
+                ("-1", nn.AvgPool2d(stride)),
+                ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
+                ("1", nn.BatchNorm2d(planes * self.expansion))
+            ]))
+
+    def forward(self, x: torch.Tensor):
+        identity = x
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.relu(self.bn2(self.conv2(out)))
+        out = self.avgpool(out)
+        out = self.bn3(self.conv3(out))
+
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity
+        out = self.relu(out)
+        return out
+
+
+class AttentionPool2d(nn.Module):
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
+        self.num_heads = num_heads
+
+    def forward(self, x):
+        x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1)  # NCHW -> (HW)NC
+        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
+        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
+        x, _ = F.multi_head_attention_forward(
+            query=x, key=x, value=x,
+            embed_dim_to_check=x.shape[-1],
+            num_heads=self.num_heads,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+            in_proj_weight=None,
+            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
+            bias_k=None,
+            bias_v=None,
+            add_zero_attn=False,
+            dropout_p=0,
+            out_proj_weight=self.c_proj.weight,
+            out_proj_bias=self.c_proj.bias,
+            use_separate_proj_weight=True,
+            training=self.training,
+            need_weights=False
+        )
+
+        return x[0]
+
+
+class ModifiedResNet(nn.Module):
+    """
+    A ResNet class that is similar to torchvision's but contains the following changes:
+    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
+    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
+    - The final pooling layer is a QKV attention instead of an average pool
+    """
+
+    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
+        super().__init__()
+        self.output_dim = output_dim
+        self.input_resolution = input_resolution
+
+        # the 3-layer stem
+        self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width // 2)
+        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(width // 2)
+        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width)
+        self.avgpool = nn.AvgPool2d(2)
+        self.relu = nn.ReLU(inplace=True)
+
+        # residual layers
+        self._inplanes = width  # this is a *mutable* variable used during construction
+        self.layer1 = self._make_layer(width, layers[0])
+        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
+        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
+        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
+
+        embed_dim = width * 32  # the ResNet feature dimension
+        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
+
+    def _make_layer(self, planes, blocks, stride=1):
+        layers = [Bottleneck(self._inplanes, planes, stride)]
+
+        self._inplanes = planes * Bottleneck.expansion
+        for _ in range(1, blocks):
+            layers.append(Bottleneck(self._inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        def stem(x):
+            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
+                x = self.relu(bn(conv(x)))
+            x = self.avgpool(x)
+            return x
+
+        x = x.type(self.conv1.weight.dtype)
+        x = stem(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.attnpool(x)
+
+        return x
+
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+
+
+class QuickGELU(nn.Module):
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ln_1 = LayerNorm(d_model)
+        self.mlp = nn.Sequential(OrderedDict([
+            ("c_fc", nn.Linear(d_model, d_model * 4)),
+            ("gelu", QuickGELU()),
+            ("c_proj", nn.Linear(d_model * 4, d_model))
+        ]))
+        self.ln_2 = LayerNorm(d_model)
+        self.attn_mask = attn_mask
+
+    def attention(self, x: torch.Tensor):
+        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
+        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.attention(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+class Transformer(nn.Module):
+    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
+
+    def forward(self, x: torch.Tensor):
+        return self.resblocks(x)
+
+
+class VisualTransformer(nn.Module):
+    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.output_dim = output_dim
+        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
+
+        scale = width ** -0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+
+        self.transformer = Transformer(width, layers, heads)
+
+        self.ln_post = LayerNorm(width)
+        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
+
+    def forward(self, x: torch.Tensor):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        x = self.ln_post(x[:, 0, :])
+
+        if self.proj is not None:
+            x = x @ self.proj
+
+        return x
+
+
+from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
+from x_transformers import ViTransformerWrapper, TransformerWrapper, Encoder, Decoder
+
+class CLIP(nn.Module):
+    def __init__(self,
+                 embed_dim: int,
+                 # vision
+                 image_resolution: int,
+                 vision_layers: Union[Tuple[int, int, int, int], int],
+                 vision_width: int,
+                 vision_patch_size: int,
+                 # text
+                 context_length: int,
+                 vocab_size: int,
+                 transformer_width: int,
+                 transformer_heads: int,
+                 transformer_layers: int,
+
+                 weight_sharing: bool = False,
+                 feature_fusion: str = 'avg',
+                 num_class: int = 90
+                 ):
+        super().__init__()
+        # default to no weight sharing when unspecified
+        if weight_sharing is None:
+            weight_sharing = False
+
+        self.weight_sharing = weight_sharing
+        self.feature_fusion = feature_fusion
+        self.context_length = context_length
+
+        if isinstance(vision_layers, (tuple, list)):
+            vision_heads = vision_width * 32 // 64
+            self.visual = ModifiedResNet(
+                layers=vision_layers,
+                output_dim=embed_dim,
+                heads=vision_heads,
+                input_resolution=image_resolution,
+                width=vision_width
+            )
+            if weight_sharing:
+                self.visual2 = self.visual
+            else:
+                self.visual2 = ModifiedResNet(
+                    layers=vision_layers,
+                    output_dim=embed_dim,
+                    heads=vision_heads,
+                    input_resolution=image_resolution,
+                    width=vision_width
+                )
+        else:
+            vision_heads = vision_width // 64
+            self.visual = VisualTransformer(
+                input_resolution=image_resolution,
+                patch_size=vision_patch_size,
+                width=vision_width,
+                layers=vision_layers,
+                heads=vision_heads,
+                output_dim=embed_dim
+            )
+            if weight_sharing:
+                self.visual2 = self.visual
+            else:
+                self.visual2 = VisualTransformer(
+                    input_resolution=image_resolution,
+                    patch_size=vision_patch_size,
+                    width=vision_width,
+                    layers=vision_layers,
+                    heads=vision_heads,
+                    output_dim=embed_dim
+                )
+
+        self.transformer = Transformer(
+            width=transformer_width,
+            layers=transformer_layers,
+            heads=transformer_heads,
+            attn_mask=self.build_attention_mask()
+        )
+
+        self.vocab_size = vocab_size
+        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
+        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
+        self.ln_final = LayerNorm(transformer_width)
+
+        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
+
+
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+        self.initialize_parameters()
+
+    def initialize_parameters(self):
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+
+        if isinstance(self.visual, ModifiedResNet):
+            if self.visual.attnpool is not None:
+                std = self.visual.attnpool.c_proj.in_features ** -0.5
+                nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
+                nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
+
+                if not self.weight_sharing:
+                    nn.init.normal_(self.visual2.attnpool.q_proj.weight, std=std)
+                    nn.init.normal_(self.visual2.attnpool.k_proj.weight, std=std)
+                    nn.init.normal_(self.visual2.attnpool.v_proj.weight, std=std)
+                    nn.init.normal_(self.visual2.attnpool.c_proj.weight, std=std)
+
+            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
+                for name, param in resnet_block.named_parameters():
+                    if name.endswith("bn3.weight"):
+                        nn.init.zeros_(param)
+            if not self.weight_sharing:
+                for resnet_block in [self.visual2.layer1, self.visual2.layer2, self.visual2.layer3, self.visual2.layer4]:
+                    for name, param in resnet_block.named_parameters():
+                        if name.endswith("bn3.weight"):
+                            nn.init.zeros_(param)
+
+        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
+        attn_std = self.transformer.width ** -0.5
+        fc_std = (2 * self.transformer.width) ** -0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+
+        if self.text_projection is not None:
+            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
+
+    def build_attention_mask(self):
+        # lazily create causal attention mask, with full attention between the vision tokens
+        # pytorch uses additive attention mask; fill with -inf
+        mask = torch.empty(self.context_length, self.context_length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)  # zero out the lower diagonal
+        return mask
+
+    @property
+    def dtype(self):
+        return self.visual.conv1.weight.dtype
+    def decode(self, caption, encode):
+        return self.decoder(caption, context=encode)  # self.decoder is expected to be attached externally; it is not defined in this class
+    def encode_image(self, image):
+        return self.visual(image.type(self.dtype))
+    def encode_sketch(self, image):
+        return self.visual2(image.type(self.dtype))
+
+    def encode_text(self, text):
+        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
+
+        x = x + self.positional_embedding.type(self.dtype)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.ln_final(x).type(self.dtype)
+
+        # x.shape = [batch_size, n_ctx, transformer.width]
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
+
+        return x
+    def freeze_nonfc(self):
+        for name, param in self.named_parameters():
+            if 'classification' not in name:  # and 'visual' not in name:
+                param.requires_grad = False
+            else:
+                param.requires_grad = True
+
+        return
+    def unfreeze_nonfc(self):
+        for name, param in self.named_parameters():
+            if 'classification' not in name:  # and 'visual' not in name:
+                param.requires_grad = True
+
+        return
+    def forward(self, image, text, sketch):
+
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+        sketch_features = self.encode_sketch(sketch)
+
+        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+        sketch_features = sketch_features / sketch_features.norm(dim=-1, keepdim=True)
+
+        fused_feature = self.feature_fuse(text_features, sketch_features)
+
+        return image_features, fused_feature
+
+    def feature_fuse(self, text_features, sketch_features):
+        # mode = avg|max (only 'avg' is implemented)
+        if self.feature_fusion == 'avg':
+            fused_features = (text_features + sketch_features) / 2
+        else:
+            raise Exception(f'Mode {self.feature_fusion} not yet supported')
+        return fused_features
+
+def convert_weights(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+
+    def _convert_weights_to_fp16(l):
+        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            l.weight.data = l.weight.data.half()
+            if l.bias is not None:
+                l.bias.data = l.bias.data.half()
+
+        if isinstance(l, nn.MultiheadAttention):
+            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+                tensor = getattr(l, attr)
+                if tensor is not None:
+                    tensor.data = tensor.data.half()
+
+        for name in ["text_projection", "proj"]:
+            if hasattr(l, name):
+                attr = getattr(l, name)
+                if attr is not None:
+                    attr.data = attr.data.half()
+
+    model.apply(_convert_weights_to_fp16)
+
+
+def build_model(state_dict: dict, weight_sharing: bool, feature_fusion: str, num_class: int):
+    vit = "visual.proj" in state_dict
+
+    if vit:
+        vision_width = state_dict["visual.conv1.weight"].shape[0]
+        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
+        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
+        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
+        image_resolution = vision_patch_size * grid_size
+    else:
+        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
+        vision_layers = tuple(counts)
+        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
+        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
+        vision_patch_size = None
+        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
+        image_resolution = output_width * 32
+
+    embed_dim = state_dict["text_projection"].shape[1]
+    context_length = state_dict["positional_embedding"].shape[0]
+    vocab_size = state_dict["token_embedding.weight"].shape[0]
+    transformer_width = state_dict["ln_final.weight"].shape[0]
+    transformer_heads = transformer_width // 64
+    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
+
+    model = CLIP(
+        embed_dim,
+        image_resolution, vision_layers, vision_width, vision_patch_size,
+        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers,
+        weight_sharing, feature_fusion,
+        num_class=num_class
+    )
+
+    for key in ["input_resolution", "context_length", "vocab_size"]:
+        if key in state_dict:
+            del state_dict[key]
+
+    convert_weights(model)
+    # TODO: only use strict=False when loading from a state dict that already has a 'visual2' branch
+    model.load_state_dict(state_dict, strict=False)
+    return model.eval()
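
Not part of the commit, but a sketch of how the two-branch model above is meant to be queried once loaded, assuming image, text, and sketch are already preprocessed/tokenized batches on the model's device:

    import torch

    with torch.no_grad():
        # forward() L2-normalizes each modality and, with feature_fusion='avg', averages text + sketch
        image_features, fused_query = model(image, text, sketch)

    # dot products rank gallery images against the fused text+sketch query
    similarity = image_features @ fused_query.t()            # [n_images, n_queries]
    ranking = similarity.argsort(dim=0, descending=True)     # best-matching image first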
code/clip/tokenizer.py ADDED
@@ -0,0 +1,140 @@
+import gzip
+import html
+import os
+from functools import lru_cache
+
+import ftfy
+import regex as re
+
+
+@lru_cache()
+def default_bpe():
+    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+
+
+class SimpleTokenizer(object):
+    def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
+        merges = merges[1:49152-256-2+1]
+        merges = [tuple(merge.split()) for merge in merges]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v+'</w>' for v in vocab]
+        for merge in merges:
+            vocab.append(''.join(merge))
+        if not special_tokens:
+            special_tokens = ['<start_of_text>', '<end_of_text>']
+        else:
+            special_tokens = ['<start_of_text>', '<end_of_text>'] + special_tokens
+        vocab.extend(special_tokens)
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {t: t for t in special_tokens}
+        special = "|".join(special_tokens)
+        self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
+
+        self.vocab_size = len(self.encoder)
+        self.all_special_ids = [self.encoder[t] for t in special_tokens]
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token+'</w>'
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:  # `first` no longer occurs in the remainder of the word
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
+        return text
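
A small round-trip sketch for the tokenizer above (not part of the commit). It assumes bpe_simple_vocab_16e6.txt.gz sits next to tokenizer.py, which is where default_bpe() looks, and that the package imports as clip:

    from clip.tokenizer import SimpleTokenizer

    tokenizer = SimpleTokenizer()                  # loads bpe_simple_vocab_16e6.txt.gz
    ids = tokenizer.encode("a sketch of a dog")    # plain BPE ids, no <start_of_text>/<end_of_text>
    text = tokenizer.decode(ids)                   # reconstructs the input (modulo whitespace)
    # clip.tokenize() wraps these ids with the special tokens and zero-pads to context_length=77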
code/training/model_configs/ViT-B-16.json ADDED
@@ -0,0 +1,12 @@
+{
+    "embed_dim": 512,
+    "image_resolution": 224,
+    "vision_layers": 12,
+    "vision_width": 768,
+    "vision_patch_size": 16,
+    "context_length": 77,
+    "vocab_size": 49408,
+    "transformer_width": 512,
+    "transformer_heads": 8,
+    "transformer_layers": 12
+}
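
For context, a sketch of how a config like this one could be used to instantiate the model directly. The training code that actually consumes these JSON files is not shown in this commit, and the weight_sharing / feature_fusion / num_class values below are illustrative defaults taken from CLIP.__init__:

    import json
    from clip.model import CLIP

    with open("code/training/model_configs/ViT-B-16.json") as f:
        cfg = json.load(f)

    # the JSON keys match the first ten arguments of CLIP.__init__ by name
    model = CLIP(**cfg, weight_sharing=False, feature_fusion='avg', num_class=90)
    print(sum(p.numel() for p in model.parameters()) / 1e6, "M parameters")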