Jonathan Malott committed
Commit 44df93e
Parent(s): initial
- .gitignore +20 -0
- Procfile +1 -0
- dalle/__init__.py +0 -0
- dalle/models/__init__.py +206 -0
- dalle/models/stage1/layers.py +373 -0
- dalle/models/stage1/vqgan.py +99 -0
- dalle/models/stage2/layers.py +140 -0
- dalle/models/stage2/transformer.py +257 -0
- dalle/models/tokenizer.py +26 -0
- dalle/utils/__init__.py +3 -0
- dalle/utils/config.py +123 -0
- dalle/utils/sampling.py +162 -0
- dalle/utils/utils.py +84 -0
- page/generate.py +97 -0
- page/reduce.py +58 -0
- requirements.txt +18 -0
- streamlit_app.py +48 -0
- utils.py +160 -0
.gitignore
ADDED
@@ -0,0 +1,20 @@
.ipynb_checkpoints/


__pycache__/


_archives/


_exampleImages/


_trash/


minDALL-E/



temp/
Procfile
ADDED
@@ -0,0 +1 @@
web: sh setup.sh && streamlit run streamlit_app.py
dalle/__init__.py
ADDED
File without changes
dalle/models/__init__.py
ADDED
@@ -0,0 +1,206 @@
# ------------------------------------------------------------------------------------
# minDALL-E
# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------

import os
import torch
import torch.nn as nn
import pytorch_lightning as pl
from typing import Optional, Tuple
from omegaconf import OmegaConf
from torch.cuda.amp import autocast
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.nn import functional as F
from .stage1.vqgan import VQGAN
from .stage2.transformer import Transformer1d, iGPT
from .. import utils
from ..utils.config import get_base_config
from ..utils.sampling import sampling, sampling_igpt
from .tokenizer import build_tokenizer

_MODELS = {
    'minDALL-E/1.3B': 'https://arena.kakaocdn.net/brainrepo/models/minDALL-E/57b008f02ceaa02b779c8b7463143315/1.3B.tar.gz'
}


class Dalle(nn.Module):
    def __init__(self,
                 config: OmegaConf) -> None:
        super().__init__()
        self.tokenizer = None
        self.stage1 = VQGAN(n_embed=config.stage1.n_embed,
                            embed_dim=config.stage1.embed_dim,
                            hparams=config.stage1.hparams)
        self.stage2 = Transformer1d(vocab_size_txt=config.stage2.vocab_size_txt,
                                    vocab_size_img=config.stage2.vocab_size_img,
                                    hparams=config.stage2.hparams)
        self.config_stage1 = config.stage1
        self.config_stage2 = config.stage2
        self.config_dataset = config.dataset

    @classmethod
    def from_pretrained(cls,
                        path: str) -> nn.Module:
        #path = _MODELS[path] if path in _MODELS else path
        #path = utils.realpath_url_or_path(path, root=os.path.expanduser(".cache/minDALL-E"))
        path = ''

        config_base = get_base_config()
        config_new = OmegaConf.load(os.path.join(path, '.cache/minDALL-E/1.3B/config.yaml'))
        config_update = OmegaConf.merge(config_base, config_new)

        model = cls(config_update)
        model.tokenizer = build_tokenizer('.cache/minDALL-E/1.3B/tokenizer',
                                          context_length=model.config_dataset.context_length,
                                          lowercase=True,
                                          dropout=None)
        model.stage1.from_ckpt('.cache/minDALL-E/1.3B/stage1_last.ckpt')
        model.stage2.from_ckpt('https://utexas.box.com/shared/static/54jc9fw0bious5nx6wvayeqaskcrdgv4.ckpt')
        #model.stage1.from_ckpt('https://utexas.box.com/shared/static/rpt9miyj2kikogyekpqnkd6y115xp51i.ckpt')
        #model.stage2.from_ckpt('https://utexas.box.com/shared/static/54jc9fw0bious5nx6wvayeqaskcrdgv4.ckpt')

        return model

    @torch.no_grad()
    def sampling(self,
                 prompt: str,
                 top_k: int = 256,
                 top_p: Optional[float] = None,
                 softmax_temperature: float = 1.0,
                 num_candidates: int = 96,
                 device: str = 'cuda:0',
                 use_fp16: bool = True) -> torch.FloatTensor:
        self.stage1.eval()
        self.stage2.eval()

        tokens = self.tokenizer.encode(prompt)
        tokens = torch.LongTensor(tokens.ids)
        tokens = torch.repeat_interleave(tokens.unsqueeze(0), num_candidates, dim=0)

        # Check if the encoding works as intended
        # print(self.tokenizer.decode_batch(tokens.tolist(), skip_special_tokens=True)[0])

        tokens = tokens.to(device)
        codes = sampling(self.stage2,
                         tokens,
                         top_k=top_k,
                         top_p=top_p,
                         softmax_temperature=softmax_temperature,
                         use_fp16=use_fp16)
        codes = codes.view(num_candidates, 16, 16)  # [B, 16, 16]
        pixels = torch.clamp(self.stage1.decode_code(codes) * 0.5 + 0.5, 0, 1)  # [B, 256, 256]
        return pixels


class ImageGPT(pl.LightningModule):
    def __init__(self,
                 config: OmegaConf) -> None:
        super().__init__()
        self.stage1 = VQGAN(n_embed=config.stage1.n_embed,
                            embed_dim=config.stage1.embed_dim,
                            hparams=config.stage1.hparams)
        self.stage2 = iGPT(vocab_size_img=config.stage2.vocab_size_img,
                           use_cls_cond=config.stage2.use_cls_cond,
                           hparams=config.stage2.hparams)
        self.config = config
        self.use_cls_cond = config.stage2.use_cls_cond

        # make the parameters in stage 1 not trainable
        self.stage1.eval()
        for p in self.stage1.parameters():
            p.requires_grad = False

    @classmethod
    def from_pretrained(cls,
                        path_upstream: str,
                        path_downstream: str) -> Tuple[nn.Module, OmegaConf]:
        config_base = get_base_config(use_default=False)
        config_down = OmegaConf.load(path_downstream)
        config_down = OmegaConf.merge(config_base, config_down)

        model = cls(config_down)
        model.stage1.from_ckpt(os.path.join(path_upstream, 'stage1_last.ckpt'), strict=True)
        model.stage2.from_ckpt(os.path.join(path_upstream, 'stage2_last.ckpt'), strict=False)
        return model, config_down

    def sample(self,
               cls_idx: Optional[int] = None,
               top_k: int = 256,
               top_p: Optional[float] = None,
               softmax_temperature: float = 1.0,
               num_candidates: int = 16,
               device: str = 'cuda:0',
               use_fp16: bool = True,
               is_tqdm: bool = True) -> torch.FloatTensor:
        self.stage1.eval()
        self.stage2.eval()

        if cls_idx is None:
            sos = self.stage2.sos.repeat(num_candidates, 1, 1)
        else:
            sos = torch.LongTensor([cls_idx]).to(device=device)
            sos = sos.repeat(num_candidates)
            sos = self.stage2.sos(sos).unsqueeze(1)

        codes = sampling_igpt(self.stage2,
                              sos=sos,
                              top_k=top_k,
                              top_p=top_p,
                              softmax_temperature=softmax_temperature,
                              use_fp16=use_fp16,
                              is_tqdm=is_tqdm)
        codes = codes.view(num_candidates, 16, 16)  # [B, 16, 16]
        pixels = torch.clamp(self.stage1.decode_code(codes) * 0.5 + 0.5, 0, 1)  # [B, 256, 256]
        return pixels

    def forward(self,
                images: torch.FloatTensor,
                labels: Optional[torch.LongTensor] = None) -> torch.FloatTensor:
        B, C, H, W = images.shape
        with torch.no_grad():
            with autocast(enabled=False):
                codes = self.stage1.get_codes(images).detach()
        logits = self.stage2(codes, labels)
        return logits, codes

    def training_step(self, batch, batch_idx):
        images, labels = batch
        logits, codes = self(images, labels=labels if self.use_cls_cond else None)
        loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), codes.view(-1))
        self.log("train/loss", loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        images, labels = batch
        logits, codes = self(images, labels=labels if self.use_cls_cond else None)
        loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), codes.view(-1))
        self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return loss

    def configure_optimizers(self):
        assert self.config.optimizer.opt_type == 'adamW'
        assert self.config.optimizer.sched_type == 'cosine'

        opt = torch.optim.AdamW(self.parameters(),
                                lr=self.config.optimizer.base_lr,
                                betas=self.config.optimizer.betas,
                                weight_decay=self.config.optimizer.weight_decay)
        sched = CosineAnnealingLR(opt,
                                  T_max=self.config.optimizer.max_steps,
                                  eta_min=self.config.optimizer.min_lr)
        sched = {
            'scheduler': sched,
            'name': 'cosine'
        }
        return [opt], [sched]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure,
                       on_tpu=False, using_native_amp=False, using_lbfgs=False):
        optimizer.step(closure=optimizer_closure)
        self.lr_schedulers().step()
        self.log("lr", self.lr_schedulers().get_last_lr()[0], on_step=True, on_epoch=False, prog_bar=True, logger=True)

    def on_epoch_start(self):
        self.stage1.eval()
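For orientation (not part of this commit): a minimal sketch of how the Dalle class above is driven. It assumes the .cache/minDALL-E/1.3B/ config, tokenizer, and stage-1/stage-2 checkpoints referenced in from_pretrained are already in place, that a CUDA device is available, and that st.session_state.page and st.session_state.bar have been set by the surrounding Streamlit app, since dalle.utils.sampling.sampling (added later in this diff) reads them during generation.

import numpy as np
from PIL import Image
from dalle.models import Dalle

# Hypothetical driver script; the path argument is currently ignored (paths are hard-coded).
model = Dalle.from_pretrained('')
model.to('cuda:0')

# Returns a [num_candidates, 3, 256, 256] tensor with values in [0, 1].
pixels = model.sampling(prompt='a painting of a tree on the ocean',
                        top_k=256,
                        softmax_temperature=1.0,
                        num_candidates=4,
                        device='cuda:0')

arr = (pixels.cpu().numpy().transpose(0, 2, 3, 1) * 255).astype(np.uint8)
Image.fromarray(arr[0]).save('candidate_0.png')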
dalle/models/stage1/layers.py
ADDED
@@ -0,0 +1,373 @@
# ------------------------------------------------------------------------------------
# Modified from VQGAN (https://github.com/CompVis/taming-transformers)
# Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer. All Rights Reserved.
# ------------------------------------------------------------------------------------

import torch
import torch.nn as nn
from typing import Tuple, Optional


def nonlinearity(x):
    # swish
    return x*torch.sigmoid(x)


def Normalize(in_channels):
    return torch.nn.GroupNorm(num_groups=32,
                              num_channels=in_channels,
                              eps=1e-6,
                              affine=True)


class Upsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(in_channels,
                                        in_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if self.with_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # no asymmetric padding in torch conv, must do it ourselves
            self.conv = torch.nn.Conv2d(in_channels,
                                        in_channels,
                                        kernel_size=3,
                                        stride=2,
                                        padding=0)

    def forward(self, x):
        if self.with_conv:
            pad = (0, 1, 0, 1)
            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
            x = self.conv(x)
        else:
            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        return x


class ResnetBlock(nn.Module):
    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512):
        assert temb_channels == 0
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(in_channels,
                                     out_channels,
                                     kernel_size=3,
                                     stride=1,
                                     padding=1)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels,
                                     out_channels,
                                     kernel_size=3,
                                     stride=1,
                                     padding=1)
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(in_channels,
                                                     out_channels,
                                                     kernel_size=3,
                                                     stride=1,
                                                     padding=1)
            else:
                self.nin_shortcut = torch.nn.Conv2d(in_channels,
                                                    out_channels,
                                                    kernel_size=1,
                                                    stride=1,
                                                    padding=0)

    def forward(self, x, temb=None):
        assert temb is None

        h = x
        h = self.norm1(h)
        h = nonlinearity(h)
        h = self.conv1(h)

        h = self.norm2(h)
        h = nonlinearity(h)
        h = self.dropout(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(x)
            else:
                x = self.nin_shortcut(x)
        return x+h


class AttnBlock(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        self.q = torch.nn.Conv2d(in_channels,
                                 in_channels,
                                 kernel_size=1,
                                 stride=1,
                                 padding=0)
        self.k = torch.nn.Conv2d(in_channels,
                                 in_channels,
                                 kernel_size=1,
                                 stride=1,
                                 padding=0)
        self.v = torch.nn.Conv2d(in_channels,
                                 in_channels,
                                 kernel_size=1,
                                 stride=1,
                                 padding=0)
        self.proj_out = torch.nn.Conv2d(in_channels,
                                        in_channels,
                                        kernel_size=1,
                                        stride=1,
                                        padding=0)

    def forward(self, x):
        h_ = x
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        b, c, h, w = q.shape
        q = q.reshape(b, c, h*w)
        q = q.permute(0, 2, 1)    # b,hw,c
        k = k.reshape(b, c, h*w)  # b,c,hw
        w_ = torch.bmm(q, k)      # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
        w_ = w_ * (int(c)**(-0.5))
        w_ = torch.nn.functional.softmax(w_, dim=2)

        # attend to values
        v = v.reshape(b, c, h*w)
        w_ = w_.permute(0, 2, 1)   # b,hw,hw (first hw of k, second of q)
        h_ = torch.bmm(v, w_)      # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
        h_ = h_.reshape(b, c, h, w)

        h_ = self.proj_out(h_)
        return x+h_


class Encoder(nn.Module):
    def __init__(self,
                 *,  # forced to use named arguments
                 ch: int,
                 out_ch: int,
                 ch_mult: Tuple[int] = (1, 2, 4, 8),
                 num_res_blocks: int,
                 attn_resolutions: Tuple[int],
                 pdrop: float = 0.0,
                 resamp_with_conv: bool = True,
                 in_channels: int,
                 resolution: int,
                 z_channels: int,
                 double_z: Optional[bool] = None) -> None:
        super().__init__()
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        # downsampling
        self.conv_in = torch.nn.Conv2d(in_channels,
                                       self.ch,
                                       kernel_size=3,
                                       stride=1,
                                       padding=1)

        curr_res = resolution
        in_ch_mult = (1,)+tuple(ch_mult)
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch*in_ch_mult[i_level]
            block_out = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
                                         dropout=pdrop))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(AttnBlock(block_in))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions-1:
                down.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=pdrop)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=pdrop)

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in,
                                        2*z_channels if double_z else z_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
        assert x.shape[2] == x.shape[3] == self.resolution, \
            "{}, {}".format(x.shape, self.resolution)

        # downsampling
        h = self.conv_in(x)
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](h)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
            if i_level != self.num_resolutions-1:
                h = self.down[i_level].downsample(h)

        # middle
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


class Decoder(nn.Module):
    def __init__(self,
                 *,  # forced to use named arguments
                 ch: int,
                 out_ch: int,
                 ch_mult: Tuple[int] = (1, 2, 4, 8),
                 num_res_blocks: int,
                 attn_resolutions: Tuple[int],
                 pdrop: float = 0.0,
                 resamp_with_conv: bool = True,
                 in_channels: int,
                 resolution: int,
                 z_channels: int,
                 double_z: bool) -> None:
        super().__init__()
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        # compute in_ch_mult, block_in and curr_res at lowest res
        block_in = ch*ch_mult[self.num_resolutions-1]
        curr_res = resolution // 2**(self.num_resolutions-1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = torch.nn.Conv2d(z_channels,
                                       block_in,
                                       kernel_size=3,
                                       stride=1,
                                       padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=pdrop)
        self.mid.attn_1 = AttnBlock(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=pdrop)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks+1):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
                                         dropout=pdrop))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(AttnBlock(block_in))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in,
                                        out_ch,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, z):
        assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks+1):
                h = self.up[i_level].block[i_block](h)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
dalle/models/stage1/vqgan.py
ADDED
@@ -0,0 +1,99 @@
# ------------------------------------------------------------------------------------
# Modified from VQGAN (https://github.com/CompVis/taming-transformers)
# Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer. All Rights Reserved.
# ------------------------------------------------------------------------------------

import torch
import torch.nn as nn
from typing import List, Tuple, Optional
from einops import rearrange
from omegaconf import OmegaConf
from .layers import Encoder, Decoder


class VectorQuantizer(nn.Module):
    """
    Simplified VectorQuantizer from the original VQGAN repository,
    with modules unnecessary for sampling removed
    """
    def __init__(self, dim: int, n_embed: int, beta: float) -> None:
        super().__init__()
        self.n_embed = n_embed
        self.dim = dim
        self.beta = beta

        self.embedding = nn.Embedding(self.n_embed, self.dim)
        self.embedding.weight.data.uniform_(-1.0 / self.n_embed, 1.0 / self.n_embed)

    def forward(self,
                z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]:
        z = rearrange(z, 'b c h w -> b h w c').contiguous()  # [B,C,H,W] -> [B,H,W,C]
        z_flattened = z.view(-1, self.dim)

        d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
            torch.sum(self.embedding.weight**2, dim=1) - 2 * \
            torch.einsum('bd,dn->bn', z_flattened, rearrange(self.embedding.weight, 'n d -> d n'))

        min_encoding_indices = torch.argmin(d, dim=1)
        z_q = self.embedding(min_encoding_indices).view(z.shape)
        return z_q, min_encoding_indices

    def get_codebook_entry(self,
                           indices: torch.LongTensor,
                           shape: Optional[List[int]] = None) -> torch.FloatTensor:
        z_q = self.embedding(indices)
        if shape is not None:
            z_q = z_q.view(shape)
            z_q = z_q.permute(0, 3, 1, 2).contiguous()
        return z_q


class VQGAN(nn.Module):
    def __init__(self, n_embed: int, embed_dim: int, hparams: OmegaConf) -> None:
        super().__init__()
        self.encoder = Encoder(**hparams)
        self.decoder = Decoder(**hparams)
        self.quantize = VectorQuantizer(dim=embed_dim, n_embed=n_embed, beta=0.25)
        self.quant_conv = torch.nn.Conv2d(hparams.z_channels, embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, hparams.z_channels, 1)
        self.latent_dim = hparams.attn_resolutions[0]

    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
        quant = self.encode(x)
        dec = self.decode(quant)
        return dec

    def encode(self, x: torch.FloatTensor) -> torch.FloatTensor:
        h = self.encoder(x)
        h = self.quant_conv(h)
        quant = self.quantize(h)[0]
        quant = rearrange(quant, 'b h w c -> b c h w').contiguous()
        return quant

    def decode(self, quant: torch.FloatTensor) -> torch.FloatTensor:
        quant = self.post_quant_conv(quant)
        dec = self.decoder(quant)
        return dec

    def decode_code(self, code: torch.LongTensor) -> torch.FloatTensor:
        quant = self.quantize.get_codebook_entry(code)
        quant = quant.permute(0, 3, 1, 2)
        dec = self.decode(quant)
        return dec

    def get_codes(self, x: torch.FloatTensor) -> torch.LongTensor:
        h = self.encoder(x)
        h = self.quant_conv(h)
        codes = self.quantize(h)[1].view(x.shape[0], self.latent_dim ** 2)
        return codes

    def from_ckpt(self, path: str, strict: bool = True) -> None:
        #ckpt = torch.load(path, map_location='cpu')['state_dict']
        #self.load_state_dict(ckpt, strict=strict)
        #print(f'{path} successfully restored..')

        #ckpt = torch.load(path, map_location='cpu')['state_dict']
        ckpt = torch.utils.model_zoo.load_url('https://utexas.box.com/shared/static/rpt9miyj2kikogyekpqnkd6y115xp51i.ckpt', map_location='cpu')['state_dict']

        self.load_state_dict(ckpt, strict=True)
        print(f'{path} successfully restored..')
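For reference (not part of the commit), the decode path above can be exercised without any checkpoint; the sketch below builds a VQGAN from the default hyperparameters added later in dalle/utils/config.py and decodes a random 16x16 code grid with untrained weights, just to show the tensor shapes.

import torch
from dalle.utils.config import get_base_config
from dalle.models.stage1.vqgan import VQGAN

config = get_base_config()
vqgan = VQGAN(n_embed=config.stage1.n_embed,      # 16384 codebook entries
              embed_dim=config.stage1.embed_dim,  # 256-d codes
              hparams=config.stage1.hparams)

codes = torch.randint(0, config.stage1.n_embed, (1, 16, 16))  # random code grid
with torch.no_grad():
    pixels = vqgan.decode_code(codes)  # [1, 3, 256, 256], meaningless without weights
print(pixels.shape)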
dalle/models/stage2/layers.py
ADDED
@@ -0,0 +1,140 @@
# ------------------------------------------------------------------------------------
# minDALL-E
# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------
# Modified from minGPT (https://github.com/karpathy/minGPT)
# Copyright (c) 2020 Andrej Karpathy. All Rights Reserved.
# ------------------------------------------------------------------------------------

import math
import torch
import torch.nn as nn
from torch.nn import functional as F


class GELU(nn.Module):
    def __init__(self, use_approx=False):
        super().__init__()
        self.use_approx = use_approx

    def forward(self, x):
        if self.use_approx:
            return x * torch.sigmoid(1.702 * x)
        else:
            return F.gelu(x)


class MultiHeadSelfAttention(nn.Module):

    def __init__(self,
                 ctx_len: int,
                 embed_dim: int,
                 n_heads: int,
                 resid_pdrop: float,
                 attn_pdrop: float,
                 attn_bias: bool,
                 use_mask: bool = True):
        super().__init__()
        assert embed_dim % n_heads == 0

        # key, query, value projections for all heads
        self.key = nn.Linear(embed_dim, embed_dim, bias=attn_bias)
        self.query = nn.Linear(embed_dim, embed_dim, bias=attn_bias)
        self.value = nn.Linear(embed_dim, embed_dim, bias=attn_bias)

        # regularization
        self.attn_drop = nn.Dropout(attn_pdrop)
        self.resid_drop = nn.Dropout(resid_pdrop)

        # output projection
        self.proj = nn.Linear(embed_dim, embed_dim, attn_bias)

        self.n_heads = n_heads
        self.ctx_len = ctx_len
        self.use_mask = use_mask
        if self.use_mask:
            self.register_buffer("mask", torch.ones(ctx_len, ctx_len), persistent=False)
            self.mask = torch.tril(self.mask).view(1, ctx_len, ctx_len)

    def forward(self, x, use_cache=False, layer_past=None):
        B, T, C = x.shape
        x = x.transpose(0, 1).contiguous()  # (B, T, C) -> (T, B, C)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(T, B*self.n_heads, C//self.n_heads).transpose(0, 1)    # (B*nh, T, hs)
        q = self.query(x).view(T, B*self.n_heads, C//self.n_heads).transpose(0, 1)  # (B*nh, T, hs)
        v = self.value(x).view(T, B*self.n_heads, C//self.n_heads).transpose(0, 1)  # (B*nh, T, hs)

        if use_cache:
            present = torch.stack([k, v])

        if layer_past is not None:
            past_key, past_value = layer_past
            k = torch.cat([past_key, k], dim=-2)
            v = torch.cat([past_value, v], dim=-2)

        if use_cache and layer_past is not None:
            # Tensor shape below: (B * nh, 1, hs) X (B * nh, hs, K) -> (B * nh, 1, K)
            att = torch.bmm(q, (k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))))
            att = F.softmax(att, dim=-1)
            att = self.attn_drop(att)
            y = torch.bmm(att, v)  # (B*nh, 1, K) X (B*nh, K, hs) -> (B*nh, 1, hs)
        else:
            # Tensor shape below: (B * nh, T, hs) X (B * nh, hs, T) -> (B * nh, T, T)
            att = torch.bmm(q, (k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))))
            if self.use_mask:
                mask = self.mask if T == self.ctx_len else self.mask[:, :T, :T]
                att = att.masked_fill(mask == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_drop(att)
            y = torch.bmm(att, v)  # (B*nh, T, T) X (B*nh, T, hs) -> (B*nh, T, hs)
        y = y.transpose(0, 1).contiguous().view(T, B, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        if use_cache:
            return y.transpose(0, 1).contiguous(), present  # (T, B, C) -> (B, T, C)
        else:
            return y.transpose(0, 1).contiguous()  # (T, B, C) -> (B, T, C)


class Block(nn.Module):

    def __init__(self,
                 ctx_len: int,
                 embed_dim: int,
                 n_heads: int,
                 mlp_bias: bool,
                 attn_bias: bool,
                 resid_pdrop: bool,
                 attn_pdrop: bool,
                 gelu_use_approx: bool):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

        self.attn = MultiHeadSelfAttention(ctx_len=ctx_len,
                                           embed_dim=embed_dim,
                                           n_heads=n_heads,
                                           attn_pdrop=attn_pdrop,
                                           resid_pdrop=resid_pdrop,
                                           attn_bias=attn_bias,
                                           use_mask=True)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim, bias=mlp_bias),
            GELU(gelu_use_approx),
            nn.Linear(4 * embed_dim, embed_dim, bias=mlp_bias),
            nn.Dropout(resid_pdrop),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

    def sample(self, x, layer_past=None):
        attn, present = self.attn(self.ln1(x), use_cache=True, layer_past=layer_past)
        x = x + attn
        x = x + self.mlp(self.ln2(x))
        return x, present
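A tiny, self-contained illustration (not part of the commit) of the Block above, using made-up hyperparameters far smaller than the real model (embed_dim=1536, 42 layers); it shows the plain causal forward pass and the cached sample path used during generation.

import torch
from dalle.models.stage2.layers import Block

block = Block(ctx_len=8, embed_dim=32, n_heads=4,
              mlp_bias=True, attn_bias=True,
              resid_pdrop=0.0, attn_pdrop=0.0, gelu_use_approx=False)

x = torch.randn(2, 8, 32)                  # (batch, sequence, embedding)
y = block(x)                               # causal self-attention + MLP, shape (2, 8, 32)

y_step, present = block.sample(x[:, :1])   # incremental path: one token in
print(y.shape, present.shape)              # (2, 8, 32) and stacked k/v cache (2, 8, 1, 8)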
dalle/models/stage2/transformer.py
ADDED
@@ -0,0 +1,257 @@
# ------------------------------------------------------------------------------------
# minDALL-E
# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------
# Modified from minGPT (https://github.com/karpathy/minGPT)
# Copyright (c) 2020 Andrej Karpathy. All Rights Reserved.
# ------------------------------------------------------------------------------------

import torch
import torch.nn as nn
from typing import Optional, Tuple, List
from torch.cuda.amp import autocast
from omegaconf import OmegaConf
from .layers import Block
import io


class Transformer1d(nn.Module):

    def __init__(self,
                 vocab_size_txt: int,
                 vocab_size_img: int,
                 hparams: OmegaConf) -> None:
        super().__init__()
        assert hparams.n_layers == hparams.n_dense_layers

        # input embedding for image and text
        self.tok_emb_img = nn.Embedding(vocab_size_img, hparams.embed_dim)
        self.tok_emb_txt = nn.Embedding(vocab_size_txt, hparams.embed_dim)

        self.pos_emb_img = nn.Embedding(hparams.ctx_len_img, hparams.embed_dim)
        self.pos_emb_txt = nn.Embedding(hparams.ctx_len_txt, hparams.embed_dim)

        self.drop = nn.Dropout(hparams.embd_pdrop)

        # transformer blocks
        self.blocks = [Block(ctx_len=hparams.ctx_len_img + hparams.ctx_len_txt,
                             embed_dim=hparams.embed_dim,
                             n_heads=hparams.n_heads,
                             mlp_bias=hparams.mlp_bias,
                             attn_bias=hparams.attn_bias,
                             resid_pdrop=hparams.resid_pdrop,
                             attn_pdrop=hparams.attn_pdrop,
                             gelu_use_approx=hparams.gelu_use_approx) for i in range(1, hparams.n_layers+1)]
        self.blocks = nn.Sequential(*self.blocks)

        # heads for image and text
        self.ln_f = nn.LayerNorm(hparams.embed_dim)
        self.head_img = nn.Linear(hparams.embed_dim, vocab_size_img, bias=False)
        self.head_txt = nn.Linear(hparams.embed_dim, vocab_size_txt, bias=False)

        self.ctx_len_img = hparams.ctx_len_img
        self.ctx_len_txt = hparams.ctx_len_txt
        self.n_layers = hparams.n_layers

        self.apply(self._init_weights)

    def _init_weights(self, module: nn.Module) -> None:
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self,
                images: torch.LongTensor,
                texts: torch.LongTensor,
                pos_images: torch.LongTensor,
                pos_texts: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        B, T = images.shape
        _, N = texts.shape

        assert T <= self.ctx_len_img, "Already reached the maximum context length (image)."
        assert N == self.ctx_len_txt, "Already reached the maximum context length (text)."

        texts = self.tok_emb_txt(texts)
        images = self.tok_emb_img(images)

        texts = texts + self.pos_emb_txt(pos_texts)
        images = images + self.pos_emb_img(pos_images)

        x = torch.cat([texts, images], axis=1).contiguous()
        x = self.drop(x)
        x = self.blocks(x)
        x = self.ln_f(x)

        texts = x[:, :N-1].contiguous()
        images = x[:, N-1:-1].contiguous()

        logits_txt = self.head_txt(texts)
        logits_img = self.head_img(images)
        return logits_img, logits_txt

    @torch.no_grad()
    def sampling(self,
                 images: torch.LongTensor,
                 texts: torch.LongTensor,
                 pos_images: torch.LongTensor,
                 pos_texts: torch.LongTensor,
                 use_fp16: bool = True,
                 past: Optional[List[torch.Tensor]] = None) -> Tuple[torch.FloatTensor, List[torch.FloatTensor]]:
        _, N = texts.shape
        assert N == self.ctx_len_txt, "Already reached the maximum context length (text)."

        with autocast(enabled=use_fp16):
            if images is None:
                assert past is None

                texts = self.tok_emb_txt(texts)
                x = texts + self.pos_emb_txt(pos_texts)
                x = self.drop(x)

                presents = []
                for i, block in enumerate(self.blocks):
                    x, present = block.sample(x, layer_past=None)
                    presents.append(present)
                x = self.ln_f(x)
                x = x[:, N-1].contiguous()
                logits = self.head_img(x)
            else:
                if past is None:
                    texts = self.tok_emb_txt(texts)
                    images = self.tok_emb_img(images)
                    texts = texts + self.pos_emb_txt(pos_texts)
                    images = images + self.pos_emb_img(pos_images)
                    x = torch.cat([texts, images], axis=1).contiguous()
                else:
                    images = self.tok_emb_img(images)
                    x = images + self.pos_emb_img(pos_images)
                x = self.drop(x)

                if past is not None:
                    past = torch.cat(past, dim=-2)
                presents = []
                for i, block in enumerate(self.blocks):
                    x, present = block.sample(x, layer_past=None if past is None else past[i])
                    presents.append(present)
                x = self.ln_f(x)
                x = x[:, -1].contiguous()
                logits = self.head_img(x)
            return logits, presents

    def from_ckpt(self, path: str) -> None:
        #ckpt = torch.load(path, map_location='cpu')['state_dict']
        ckpt = torch.utils.model_zoo.load_url('https://utexas.box.com/shared/static/54jc9fw0bious5nx6wvayeqaskcrdgv4.ckpt', map_location='cpu')['state_dict']

        self.load_state_dict(ckpt, strict=True)
        print(f'{path} successfully restored..')


class iGPT(nn.Module):
    def __init__(self,
                 vocab_size_img: int,
                 use_cls_cond: bool,
                 hparams: OmegaConf) -> None:
        super().__init__()
        self.use_cls_cond = use_cls_cond

        # sos token embedding
        if self.use_cls_cond:
            self.sos = nn.Embedding(hparams.n_classes, hparams.embed_dim)
        else:
            self.sos = nn.Parameter(torch.randn(1, 1, hparams.embed_dim))

        # input embedding
        self.tok_emb_img = nn.Embedding(vocab_size_img, hparams.embed_dim)
        self.pos_emb_img = nn.Embedding(hparams.ctx_len_img, hparams.embed_dim)

        self.drop = nn.Dropout(hparams.embd_pdrop)

        # transformer blocks
        self.blocks = [Block(ctx_len=hparams.ctx_len_img + 1,
                             embed_dim=hparams.embed_dim,
                             n_heads=hparams.n_heads,
                             mlp_bias=hparams.mlp_bias,
                             attn_bias=hparams.attn_bias,
                             resid_pdrop=hparams.resid_pdrop,
                             attn_pdrop=hparams.attn_pdrop,
                             gelu_use_approx=hparams.gelu_use_approx) for i in range(1, hparams.n_layers+1)]
        self.blocks = nn.Sequential(*self.blocks)

        # head
        self.ln_f = nn.LayerNorm(hparams.embed_dim)
        self.head = nn.Linear(hparams.embed_dim, vocab_size_img, bias=False)

        self.ctx_len_img = hparams.ctx_len_img
        self.n_layers = hparams.n_layers

        self.apply(self._init_weights)

    def _init_weights(self, module: nn.Module) -> None:
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @torch.no_grad()
    def sampling(self,
                 sos: torch.FloatTensor,
                 codes: torch.LongTensor,
                 pos_codes: torch.LongTensor,
                 n_samples: int = 16,
                 use_fp16: bool = True,
                 past: Optional[torch.Tensor] = None) -> Tuple[torch.FloatTensor, List[torch.FloatTensor]]:
        with autocast(enabled=use_fp16):
            if codes is None:
                assert past is None
                xs = self.drop(sos)
                presents = []
                for i, block in enumerate(self.blocks):
                    xs, present = block.sample(xs, layer_past=None)
                    presents.append(present)
                xs = self.ln_f(xs)
                logits = self.head(xs)[:, -1]
            else:
                if past is None:
                    xs = self.tok_emb_img(codes) + self.pos_emb_img(pos_codes)
                    xs = torch.cat([sos, xs], dim=1)
                else:
                    xs = self.tok_emb_img(codes) + self.pos_emb_img(pos_codes)
                xs = self.drop(xs)

                past = torch.cat(past, dim=-2) if past is not None else past
                presents = []
                for i, block in enumerate(self.blocks):
                    xs, present = block.sample(xs, layer_past=None if past is None else past[i])
                    presents.append(present)

                xs = self.ln_f(xs)
                logits = self.head(xs)[:, -1]
            return logits, presents

    def forward(self,
                codes: torch.LongTensor,
                labels: Optional[torch.LongTensor] = None) -> torch.FloatTensor:
        B, T = codes.shape
        xps = torch.arange(T, device=codes.device).repeat((B, 1))
        sos = self.sos.repeat((B, 1, 1)) if labels is None else self.sos(labels).unsqueeze(1)

        h = self.tok_emb_img(codes) + self.pos_emb_img(xps)
        h = torch.cat([sos, h[:, :-1]], dim=1).contiguous()

        h = self.drop(h)
        h = self.blocks(h)
        h = self.ln_f(h)
        logits = self.head(h)
        return logits

    def from_ckpt(self, path: str, strict: bool = True) -> None:
        ckpt = torch.load(path, map_location='cpu')['state_dict']
        self.load_state_dict(ckpt, strict=strict)
        print(f'{path} successfully restored..')
dalle/models/tokenizer.py
ADDED
@@ -0,0 +1,26 @@
# ------------------------------------------------------------------------------------
# minDALL-E
# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------

import os
from functools import partial
from tokenizers import CharBPETokenizer


def build_tokenizer(path: str,
                    context_length: int = 64,
                    *args,
                    **kwargs):
    from_file = partial(CharBPETokenizer.from_file,
                        vocab_filename=os.path.join(path, 'bpe-16k-vocab.json'),
                        merges_filename=os.path.join(path, 'bpe-16k-merges.txt'),
                        unk_token='[UNK]')
    tokenizer = from_file(*args, **kwargs)
    tokenizer.add_special_tokens(['[PAD]'])
    tokenizer.enable_padding(length=context_length,
                             pad_id=tokenizer.token_to_id('[PAD]'))
    tokenizer.enable_truncation(max_length=context_length)
    print(f'{path} successfully restored..')
    return tokenizer
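As a point of reference (not part of the commit), this is the call pattern Dalle.from_pretrained uses for the tokenizer above; it assumes the bpe-16k-vocab.json and bpe-16k-merges.txt files are present under the given path, as in .cache/minDALL-E/1.3B/tokenizer.

from dalle.models.tokenizer import build_tokenizer

tokenizer = build_tokenizer('.cache/minDALL-E/1.3B/tokenizer',
                            context_length=64,
                            lowercase=True,   # forwarded to CharBPETokenizer.from_file
                            dropout=None)

encoded = tokenizer.encode('a painting of a tree on the ocean')
print(len(encoded.ids))  # 64: padded or truncated to the configured context length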
dalle/utils/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .utils import *
from .config import *
from .sampling import *
dalle/utils/config.py
ADDED
@@ -0,0 +1,123 @@
# ------------------------------------------------------------------------------------
# minDALL-E
# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------

from typing import Optional, List
from dataclasses import dataclass, field
from omegaconf import OmegaConf


@dataclass
class DataConfig:
    dataset: Optional[str] = None
    tokenizer_type: str = 'CharBPE'
    context_length: int = 64
    image_resolution: int = 256
    transforms: str = 'dalle-vqvae'
    bpe_pdrop: Optional[float] = None


@dataclass
class Stage1Hparams:
    double_z: bool = False
    z_channels: int = 256
    resolution: int = 256
    in_channels: int = 3
    out_ch: int = 3
    ch: int = 128
    ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
    num_res_blocks: int = 2
    attn_resolutions: List[int] = field(default_factory=lambda: [16])
    pdrop: float = 0.0


@dataclass
class Stage2Hparams:
    embed_dim: int = 1536
    n_layers: int = 42
    n_heads: int = 24
    n_dense_layers: int = 42
    ctx_len_img: int = 256
    ctx_len_txt: int = 64
    embd_pdrop: float = 0.0
    resid_pdrop: float = 0.0
    attn_pdrop: float = 0.0
    mlp_bias: bool = True
    attn_bias: bool = True
    gelu_use_approx: bool = False
    use_head_txt: bool = True
    n_classes: Optional[int] = None


@dataclass
class Stage1Config:
    type: str = 'vqgan'
    embed_dim: int = 256
    n_embed: int = 16384
    hparams: Stage1Hparams = Stage1Hparams()


@dataclass
class Stage2Config:
    type: str = 'transformer1d'
    vocab_size_txt: int = 16384
    vocab_size_img: int = 16384
    use_cls_cond: Optional[bool] = None
    hparams: Stage2Hparams = Stage2Hparams()


@dataclass
class WarmupConfig:
    epoch: int = 1
    multiplier: int = 1
    buffer_epoch: int = 0
    min_lr: float = 0.0
    mode: str = 'fix'
    peak_lr: float = 1e-4
    start_from_zero: bool = True


@dataclass
class OptConfig:
    opt_type: str = 'adamW'
    base_lr: float = 1e-4
    weight_decay: float = 1e-4
    betas: List[float] = field(default_factory=lambda: [0.9, 0.99])
    grad_clip_norm: float = 1.0

    sched_type: str = 'cosine'
    max_steps: int = 0
    min_lr: float = 0.0


@dataclass
class ExpConfig:
    local_batch_size: int = 4
    total_batch_size: int = 512
    valid_batch_size: int = 32
    epochs: int = 10
    save_ckpt_freq: int = 2
    test_freq: int = 1
    use_amp: bool = True


@dataclass
class DefaultConfig:
    dataset: DataConfig = DataConfig()
    stage1: Stage1Config = Stage1Config()
    stage2: Stage2Config = Stage2Config()


@dataclass
class FineTuningConfig:
    dataset: DataConfig = DataConfig()
    stage1: Stage1Config = Stage1Config()
    stage2: Stage2Config = Stage2Config()
    optimizer: OptConfig = OptConfig()
    experiment: ExpConfig = ExpConfig()


def get_base_config(use_default=True):
    return OmegaConf.structured(DefaultConfig if use_default else FineTuningConfig)
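For reference (not part of the commit), the structured config above can be materialized and overridden with plain OmegaConf calls; this mirrors what Dalle.from_pretrained does when it merges the base config with the 1.3B config.yaml.

from omegaconf import OmegaConf
from dalle.utils.config import get_base_config

config = get_base_config()              # DefaultConfig as an OmegaConf object
print(config.stage2.hparams.n_layers)   # 42
print(config.stage1.n_embed)            # 16384

# Overrides merge the same way a loaded config.yaml would in Dalle.from_pretrained
override = OmegaConf.create({'dataset': {'context_length': 32}})
config = OmegaConf.merge(config, override)
print(config.dataset.context_length)    # 32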
dalle/utils/sampling.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------------------------
# minDALL-E
# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------

import torch
from typing import Optional
from tqdm import tqdm
from torch.nn import functional as F
import streamlit as st


def cutoff_topk_logits(logits: torch.FloatTensor, k: int) -> torch.FloatTensor:
    if k is None:
        return logits
    else:
        v, ix = torch.topk(logits, k)
        out = logits.clone()
        out[out < v[:, [-1]]] = -float('Inf')
        return out


def cutoff_topp_probs(probs: torch.FloatTensor, p: float) -> torch.FloatTensor:
    if p is None:
        return probs
    else:
        sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True)
        cum_probs = torch.cumsum(sorted_probs, dim=-1)

        sorted_idx_remove_cond = cum_probs >= p

        sorted_idx_remove_cond[..., 1:] = sorted_idx_remove_cond[..., :-1].clone()
        sorted_idx_remove_cond[..., 0] = 0

        indices_to_remove = sorted_idx_remove_cond.scatter(-1, sorted_indices, sorted_idx_remove_cond)
        probs = probs.masked_fill(indices_to_remove, 0.0)
        norm_probs = probs / torch.sum(probs, dim=-1, keepdim=True)
        return norm_probs


def get_positional_encoding(inputs: torch.LongTensor, mode: str = '1d') -> torch.LongTensor:
    device = inputs.device
    if mode == '1d':
        B, N = inputs.shape
        xs_pos = torch.arange(N, device=device).repeat((B, 1))
    elif mode == '2d':
        B, H, W = inputs.shape
        xs_pos_h = torch.arange(H, device=device).repeat(B, W, 1).transpose(1, 2)
        xs_pos_w = torch.arange(W, device=device).repeat(B, H, 1)
        xs_pos = (xs_pos_h, xs_pos_w)
    else:
        raise ValueError('%s positional encoding invalid' % mode)
    return xs_pos


@torch.no_grad()
def sampling(model: torch.nn.Module,
             tokens: torch.LongTensor,
             top_k: Optional[float] = None,
             top_p: Optional[float] = None,
             softmax_temperature: float = 1.0,
             is_tqdm: bool = True,
             use_fp16: bool = True,
             max_seq_len: int = 256) -> torch.LongTensor:
    code = None
    past = None

    pbar = tqdm(range(max_seq_len), total=max_seq_len) if is_tqdm else range(max_seq_len)
    pos_enc_tokens = get_positional_encoding(tokens, mode='1d')

    for cnt, h in enumerate(pbar):
        if code is None:
            code_ = None
            pos_enc_code_ = None
        else:
            code_ = code.clone().detach()
            pos_enc_code_ = get_positional_encoding(code_, mode='1d')
            code_ = code_[:, cnt-1].unsqueeze(-1)
            pos_enc_code_ = pos_enc_code_[:, cnt-1].unsqueeze(-1)

        logits, present = model.sampling(images=code_,
                                         texts=tokens,
                                         pos_images=pos_enc_code_,
                                         pos_texts=pos_enc_tokens,
                                         use_fp16=use_fp16,
                                         past=past)
        logits = logits.to(dtype=torch.float32)
        logits = logits / softmax_temperature

        present = torch.stack(present).clone().detach()
        if past is None:
            past = [present]
        else:
            past.append(present)

        logits = cutoff_topk_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        probs = cutoff_topp_probs(probs, top_p)

        idx = torch.multinomial(probs, num_samples=1).clone().detach()
        code = idx if code is None else torch.cat([code, idx], axis=1)

        # Streamlit-specific additions: stop early if the user has navigated away
        # from the generation page, otherwise advance the page's progress bar.
        if st.session_state.page != 0:
            break

        st.session_state.bar.progress(cnt / max_seq_len)

    del past
    return code


@torch.no_grad()
def sampling_igpt(model: torch.nn.Module,
                  sos: torch.FloatTensor,
                  top_k: Optional[float] = None,
                  top_p: Optional[float] = None,
                  softmax_temperature: float = 1.0,
                  is_tqdm: bool = True,
                  use_fp16: bool = True,
                  max_seq_len: int = 256) -> torch.LongTensor:
    code = None
    past = None
    pbar = tqdm(range(max_seq_len), total=max_seq_len) if is_tqdm else range(max_seq_len)

    for cnt, h in enumerate(pbar):
        if code is None:
            code_ = None
            pos_enc_code_ = None
        else:
            code_ = code.clone().detach()
            pos_enc_code_ = get_positional_encoding(code_, mode='1d')
            code_ = code_[:, cnt-1].unsqueeze(-1)
            pos_enc_code_ = pos_enc_code_[:, cnt-1].unsqueeze(-1)

        logits, present = model.sampling(sos=sos,
                                         codes=code_,
                                         pos_codes=pos_enc_code_,
                                         use_fp16=use_fp16,
                                         past=past)
        logits = logits.to(dtype=torch.float32)
        logits = logits / softmax_temperature

        present = torch.stack(present).clone().detach()
        if past is None:
            past = [present]
        else:
            past.append(present)

        logits = cutoff_topk_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        probs = cutoff_topp_probs(probs, top_p)

        idx = torch.multinomial(probs, num_samples=1).clone().detach()
        code = idx if code is None else torch.cat([code, idx], axis=1)

    del past
    return code
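The two cutoff helpers above are easiest to see on a toy case. The sketch below is illustrative only and not part of the commit; it assumes the helpers are importable from dalle.utils.sampling, and the five-token logits tensor is made up.

import torch
from torch.nn import functional as F
from dalle.utils.sampling import cutoff_topk_logits, cutoff_topp_probs

toy_logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0]])   # hypothetical 5-token vocabulary
filtered = cutoff_topk_logits(toy_logits, 3)               # logits outside the top 3 become -inf
probs = F.softmax(filtered, dim=-1)
probs = cutoff_topp_probs(probs, 0.9)                      # drop the tail beyond cumulative p=0.9, then renormalize
next_token = torch.multinomial(probs, num_samples=1)       # sample one token id from the survivors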
dalle/utils/utils.py
ADDED
@@ -0,0 +1,84 @@
# ------------------------------------------------------------------------------------
# minDALL-E
# Copyright (c) 2021 Kakao Brain Corp. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------

import os
import random
import urllib.request
import urllib.parse
import hashlib
import tarfile
import torch
import clip
import numpy as np
from PIL import Image
from torch.nn import functional as F
from tqdm import tqdm


def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


@torch.no_grad()
def clip_score(prompt: str,
               images: np.ndarray,
               model_clip: torch.nn.Module,
               preprocess_clip,
               device: str) -> np.ndarray:
    images = [preprocess_clip(Image.fromarray((image * 255).astype(np.uint8))) for image in images]
    images = torch.stack(images, dim=0).to(device=device)
    texts = clip.tokenize(prompt).to(device=device)
    texts = torch.repeat_interleave(texts, images.shape[0], dim=0)

    image_features = model_clip.encode_image(images)
    text_features = model_clip.encode_text(texts)

    scores = F.cosine_similarity(image_features, text_features).squeeze()
    rank = torch.argsort(scores, descending=True).cpu().numpy()
    return rank


def download(url: str, root: str) -> str:
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)
    pathname = filename[:-len('.tar.gz')]

    expected_md5 = url.split("/")[-2]
    download_target = os.path.join(root, filename)
    result_path = os.path.join(root, pathname)

    if os.path.isfile(download_target) and (os.path.exists(result_path) and not os.path.isfile(result_path)):
        return result_path

    with urllib.request.urlopen(url) as source, open(download_target, 'wb') as output:
        with tqdm(total=int(source.info().get('Content-Length')), ncols=80, unit='iB', unit_scale=True,
                  unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if hashlib.md5(open(download_target, 'rb').read()).hexdigest() != expected_md5:
        raise RuntimeError(f'Model has been downloaded but the md5 checksum does not match')

    with tarfile.open(download_target, 'r:gz') as f:
        pbar = tqdm(f.getmembers(), total=len(f.getmembers()))
        for member in pbar:
            pbar.set_description(f'extracting: {member.name} (size:{member.size // (1024 * 1024)}MB)')
            f.extract(member=member, path=root)

    return result_path


def realpath_url_or_path(url_or_path: str, root: str = None) -> str:
    if urllib.parse.urlparse(url_or_path).scheme in ('http', 'https'):
        return download(url_or_path, root)
    return url_or_path
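As a rough usage sketch (not part of the commit): download() reads the expected md5 checksum from the second-to-last URL segment, and realpath_url_or_path() only triggers it for http(s) inputs. The cache directory below is a placeholder choice.

from dalle.utils.utils import set_seed, realpath_url_or_path

set_seed(0)  # seeds random, numpy and torch (CPU and CUDA) in one call

# A plain filesystem path is returned unchanged ...
assert realpath_url_or_path('/tmp/minDALL-E/1.3B') == '/tmp/minDALL-E/1.3B'

# ... while an http(s) URL is fetched and extracted by download(); the URL shape it
# assumes looks like this (placeholder, not a real endpoint):
# realpath_url_or_path('https://example.com/<md5-checksum>/1.3B.tar.gz', root='.cache/minDALL-E')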
page/generate.py
ADDED
@@ -0,0 +1,97 @@
import collections
from numpy.core.defchararray import lower
import streamlit as st
import numpy as np
import pandas as pd
import os, random, time
from utils import footer, generate, drawGrid
from PIL import Image

mode = "ai"
#mode = "dummy"

def app():

    st.title('AI-Generated Architecture')

    st.subheader('Describe a building, interior, or other architecture you would like to see.')

    #Modern architecture museum with black brick and large windows.
    prompt = st.text_input(label="", value="Modern architecture museum with black brick and large windows.")

    st.text("")

    with st.expander("Having trouble thinking of something? Click here to view examples."):
        st.write("""
        • Modern architecture museum with black brick and large windows.\n
        • A prosaic, simple architecture.\n
        • An urban, post-modern architecture with concrete and steel.\n
        • A sleek urban interior design.
        """)

    st.text("")

    crazy = st.slider('Temperature. This controls how "crazy" generated images are, where 0 is the least crazy.', 0.0, 1.0, 0.75)
    k = st.slider('Top K. The higher the value, the higher quality the results tend to be at the cost of extra processing time.', 1, 10, 1)

    if 'results' not in st.session_state:
        st.session_state.results = []

    holder = st.empty()
    startButton = holder.button("Start")

    already = []

    print("-0-")

    if startButton or hasattr(st.session_state, 'load_state'):

        with st.spinner("Generating..."):

            print("-1-")

            holder.empty()

            nextButton = holder.button("finished generating images")
            st.session_state.load_state = True

            placeholder = st.empty()
            second = st.empty()

            with second.container():
                drawGrid()

            # Keep generating until the working set holds 16 images or the user
            # clicks "finished generating images".
            while len(st.session_state.results) <= 15:

                print("Length " + str(len(st.session_state.results)))

                with placeholder.container():

                    st.session_state.bar = placeholder.progress(0)

                    if nextButton:
                        st.session_state.page = 1
                        break

                    generate(prompt, crazy, k)

                    with second.container():
                        drawGrid()

            #placeholder.empty()
            #st.session_state.bar = placeholder.progress(0)
            #drawGrid(placeholder)
page/reduce.py
ADDED
@@ -0,0 +1,58 @@
import collections
from numpy.core.defchararray import lower
import streamlit as st
import numpy as np
import pandas as pd
from zipfile import ZipFile
import io
import os

def dell(ix):
    print("!!!!")
    st.session_state.results.pop(ix)


def app():

    st.title('AI-Generated Architecture')

    st.subheader('Choose which images you would like to remove from your working set.')

    # Clear out any images left over in the temporary directory.
    for f in os.listdir("temp/"):
        os.remove(os.path.join("temp/", f))

    # Create a ZipFile object and add the current working set of images to it.
    zipObj = ZipFile('ai_architecture.zip', 'w')
    for ix, result in enumerate(st.session_state.results):
        result['image'].save("temp/" + str(ix) + ".jpeg")
        zipObj.write("temp/" + str(ix) + ".jpeg")

    zipObj.close()

    st.download_button(
        label="Download images as zip",
        data=open('ai_architecture.zip', 'rb'),
        file_name='ai_architecture.zip',
        mime='application/zip'
    )

    deleteButtons = []

    for ix, result in enumerate(st.session_state.results):

        with st.container():
            col1, col2 = st.columns(2)

            with col1:
                st.image(result['image'])
            with col2:
                st.button("delete ", key=ix, on_click=dell, kwargs=dict(ix=ix))

            m = st.markdown("""
            <hr />""", unsafe_allow_html=True)
requirements.txt
ADDED
@@ -0,0 +1,18 @@
clip==0.2.0
Cython==0.29.30
clip_anytorch==2.4.0
htbuilder==0.6.0
iteration_utilities==0.11.0
numpy==1.22.4
omegaconf==2.2.2
pages==0.3
pandas==1.4.2
Pillow==9.2.0
pytorch_lightning==1.6.3
ruclip==0.0.1
rudalle==1.1.3
streamlit==1.10.0
tokenizers==0.12.1
torch==1.8.0
torchvision==0.9.0
tqdm==4.64.0
streamlit_app.py
ADDED
@@ -0,0 +1,48 @@
import streamlit as st
import pandas as pd
import numpy as np

import os, random, time

from utils import footer
from page import generate, reduce


if not hasattr(st.session_state, 'page'):
    st.session_state.page = 0

if not hasattr(st.session_state, 'results'):
    st.session_state.results = []

p1 = st.empty()
p2 = st.empty()
p3 = st.empty()


st.session_state.stop = False
st.session_state.progress = 0
st.session_state.regenerate = False

if st.session_state.page == 0:
    p2.empty()
    p3.empty()
    with p1.container():
        generate.app()


if st.session_state.page == 1:
    p1.empty()
    p3.empty()
    with p2.container():
        reduce.app()

if st.session_state.page == 2:
    p1.empty()
    p2.empty()
    with p3.container():
        st.write("This 333")
        startButton = st.button("S3")
        if startButton:
            st.session_state.page = 0

footer()
utils.py
ADDED
@@ -0,0 +1,160 @@
from htbuilder import HtmlElement, div, ul, li, br, hr, a, p, img, styles, classes, fonts
from htbuilder.units import percent, px
from htbuilder.funcs import rgba, rgb
import streamlit as st
import os
import sys
import argparse
import clip
import numpy as np
from PIL import Image
from dalle.models import Dalle
from dalle.utils.utils import set_seed, clip_score

def link(link, text, **style):
    return a(_href=link, _target="_blank", style=styles(**style))(text)

def layout(*args):

    style = """
    <style>
      # MainMenu {visibility: hidden;}
      footer {visibility: hidden;}
      .stApp { bottom: 105px; }
    </style>
    """

    style_div = styles(
        position="fixed",
        left=0,
        bottom=0,
        margin=px(0, 0, 0, 0),
        width=percent(100),
        color="black",
        text_align="center",
        height="auto",
        opacity=1
    )

    style_hr = styles(
        display="block",
        margin=px(8, 8, "auto", "auto"),
        border_style="inset",
        border_width=px(2)
    )

    body = p()
    foot = div(
        style=style_div
    )(
        hr(
            style=style_hr
        ),
        body
    )

    st.markdown(style, unsafe_allow_html=True)

    for arg in args:
        if isinstance(arg, str):
            body(arg)

        elif isinstance(arg, HtmlElement):
            body(arg)

    st.markdown(str(foot), unsafe_allow_html=True)

def footer():
    myargs = [
        "Created by ",
        link("https://jonathanmalott.com", "Jonathan Malott"),
        br(),
        link("https://bridgingbarriers.utexas.edu/good-systems", "Good Systems"),
        " Grand Challenge",
        ", The University of Texas at Austin.",
        " Advised by Dr. Junfeng Jiao.",
        br(),
        br(),
    ]
    layout(*myargs)

#footer()

def generate(prompt, crazy, k):

    device = 'cpu'
    print("-2-")
    model = Dalle.from_pretrained('.cache/minDALL-E/1.3B')  # This will automatically download the pretrained model.
    print("-3-")
    model.to(device=device)
    num_candidates = 1

    images = []

    set_seed(np.random.randint(0, 10000))

    # Sampling
    images = model.sampling(prompt=prompt,
                            top_k=2048,
                            top_p=None,
                            softmax_temperature=crazy,
                            num_candidates=num_candidates,
                            device=device).cpu().numpy()
    images = np.transpose(images, (0, 2, 3, 1))

    # CLIP Re-ranking
    model_clip, preprocess_clip = clip.load("ViT-B/32", device=device)
    model_clip.to(device=device)
    rank = clip_score(prompt=prompt,
                      images=images,
                      model_clip=model_clip,
                      preprocess_clip=preprocess_clip,
                      device=device)

    result = images[rank]

    item = {}
    item['prompt'] = prompt
    item['crazy'] = crazy
    item['k'] = k
    item['image'] = Image.fromarray((result * 255).astype(np.uint8))
    st.session_state.results.append(item)


def drawGrid():
    master = {}
    order = 0

    #print(st.session_state.results)

    for r in st.session_state.results[::-1]:
        _txt = r['prompt'] + " " + str(r['crazy']) + " " + str(r['k'])

        if _txt not in master:
            master[_txt] = [r]
            order += 1
        else:
            master[_txt].append(r)

    for m in master:
        #with placeholder.container():

        txt = master[m][0]['prompt'] + " (temperature:" + str(master[m][0]['crazy']) + ", top k:" + str(master[m][0]['k']) + ")"
        st.subheader(txt)
        col1, col2, col3 = st.columns(3)

        for ix, item in enumerate(master[m]):
            if ix % 3 == 0:
                with col1:
                    st.image(item["image"])
            if ix % 3 == 1:
                with col2:
                    st.image(item["image"])
            if ix % 3 == 2:
                with col3:
                    st.image(item["image"])
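For reference, a sketch (not part of the commit) of the dictionary shape that generate() appends to st.session_state.results and that drawGrid() groups by prompt, temperature, and top-k; the blank PIL image stands in for a generated sample.

from PIL import Image

example_item = {
    'prompt': 'A sleek urban interior design.',   # text prompt shown as the grid subheader
    'crazy': 0.75,                                # softmax temperature used for this sample
    'k': 1,                                       # top-k setting recorded with the result
    'image': Image.new('RGB', (256, 256)),        # placeholder for the generated image
}
# Appending it via st.session_state.results.append(example_item) would make drawGrid() render it.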