yucornetto committed
Commit dada74e
1 Parent(s): cd8845c

Upload 20 files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/random_vis_l32.png filter=lfs diff=lfs merge=lfs -text
+ assets/recon_w_model_size_num_token.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,13 +1,8 @@
- ---
  title: TiTok
- emoji: 😻
+ emoji: 🏆
  colorFrom: indigo
- colorTo: blue
+ colorTo: pink
  sdk: gradio
- sdk_version: 4.36.1
+ sdk_version: 4.36.0
  app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ pinned: false
app.py ADDED
@@ -0,0 +1,92 @@
+ # Reference: https://huggingface.co/spaces/FoundationVision/LlamaGen/blob/main/app.py
+ from PIL import Image
+ import gradio as gr
+ from imagenet_classes import imagenet_idx2classname
+ from huggingface_hub import hf_hub_download
+ import torch
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+ import time
+ import argparse
+ import demo_util
+ import os
+
+ device = "cuda"
+
+ model2ckpt = {
+     "TiTok-L-32": ("tokenizer_titok_l32.bin", "generator_titok_l32.bin"),
+ }
+
+ # Fetch the TiTok-L-32 checkpoints from Google Drive on first launch.
+ if not os.path.exists("tokenizer_titok_l32.bin"):
+     os.system("gdown 1I_m2Vm4JgQsa7bZVORj-nVhP8fgQLngd")
+ if not os.path.exists("generator_titok_l32.bin"):
+     os.system("gdown 1IgqZ_vwGIj2ZWOPuCzilxeQ2UrMVY93l")
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--precision", type=str, default='bf16', choices=["none", "fp16", "bf16"])
+ parser.add_argument("--guidance_scale", type=float, default=3.5)
+ parser.add_argument("--randomize_temperature", type=float, default=1.0)
+ parser.add_argument("--num_sample_steps", type=int, default=8)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--temperature", type=float, default=1.0, help="temperature value to sample with")
+ args = parser.parse_args()
+
+ config = demo_util.get_config("configs/titok_l32.yaml")
+ print(config)
+ titok_tokenizer = demo_util.get_titok_tokenizer(config)
+ print(titok_tokenizer)
+ titok_generator = demo_util.get_titok_generator(config)
+ print(titok_generator)
+
+ titok_tokenizer = titok_tokenizer.to(device)
+ titok_generator = titok_generator.to(device)
+
+
+ def demo_infer(guidance_scale, randomize_temperature, num_sample_steps,
+                class_label, seed):
+     n = 4
+     class_labels = [class_label for _ in range(n)]
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     t1 = time.time()
+     generated_image = demo_util.sample_fn(
+         generator=titok_generator,
+         tokenizer=titok_tokenizer,
+         labels=class_labels,
+         guidance_scale=guidance_scale,
+         randomize_temperature=randomize_temperature,
+         num_sample_steps=num_sample_steps,
+         device=device
+     )
+     sampling_time = time.time() - t1
+     print(f"generation takes about {sampling_time:.2f} seconds.")
+     samples = [Image.fromarray(sample) for sample in generated_image]
+     return samples
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("<h1 style='text-align: center'>An Image is Worth 32 Tokens for Reconstruction and Generation</h1>")
+
+     with gr.Tabs():
+         with gr.TabItem('Generate'):
+             with gr.Row():
+                 with gr.Column():
+                     with gr.Row():
+                         i1k_class = gr.Dropdown(
+                             list(imagenet_idx2classname.values()),
+                             value='macaw',
+                             type="index", label='ImageNet-1K Class'
+                         )
+                     guidance_scale = gr.Slider(minimum=1, maximum=25, step=0.1, value=3.5, label='Classifier-free Guidance Scale')
+                     randomize_temperature = gr.Slider(minimum=0., maximum=10.0, step=0.1, value=1.0, label='randomize_temperature')
+                     num_sample_steps = gr.Slider(minimum=1, maximum=32, step=1, value=8, label='num_sample_steps')
+                     seed = gr.Slider(minimum=0, maximum=1000, step=1, value=42, label='Seed')
+                     button = gr.Button("Generate", variant="primary")
+                 with gr.Column():
+                     output = gr.Gallery(label='Generated Images', height=700)
+             # Route the click through the Gradio wrapper defined above; sample_fn itself
+             # expects model handles, not the raw UI values passed as inputs here.
+             button.click(demo_infer, inputs=[
+                 guidance_scale, randomize_temperature, num_sample_steps,
+                 i1k_class, seed],
+                 outputs=[output])
+ demo.queue()
+ demo.launch(debug=True)
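
For quick local testing outside the Gradio UI, the same pipeline can be driven headlessly. The sketch below is illustrative and not part of this upload; it assumes the two checkpoint files have already been downloaded next to the script and that a CUDA device is available (class index 88 is simply the "macaw" entry used as the dropdown default).

# Minimal headless sketch (assumptions: checkpoints present, CUDA available; not part of this commit).
from PIL import Image
import demo_util

config = demo_util.get_config("configs/titok_l32.yaml")
tokenizer = demo_util.get_titok_tokenizer(config).to("cuda")
generator = demo_util.get_titok_generator(config).to("cuda")

# 88 = "macaw" in imagenet_idx2classname; any index in [0, 999] works.
images = demo_util.sample_fn(
    generator=generator,
    tokenizer=tokenizer,
    labels=[88, 88, 88, 88],
    guidance_scale=3.5,
    randomize_temperature=1.0,
    num_sample_steps=8,
    device="cuda",
)
for i, img in enumerate(images):
    Image.fromarray(img).save(f"sample_{i}.png")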
assets/ILSVRC2012_val_00008636.png ADDED
assets/ILSVRC2012_val_00010240.png ADDED
assets/random_vis_l32.png ADDED

Git LFS Details

  • SHA256: ff40d0274f7d6656791e4fc72afbf0d46b0a3975803d6184a46baac0ab80438e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.27 MB
assets/recon_w_model_size_num_token.png ADDED

Git LFS Details

  • SHA256: 8e5fe53bb8aa64fe918a33de92ac2d965d46871298eeec6fcd2a4a00f1b75386
  • Pointer size: 132 Bytes
  • Size of remote file: 1.49 MB
assets/speed_vs_perf.png ADDED
assets/titok_teaser.png ADDED
configs/titok_l32.yaml ADDED
@@ -0,0 +1,29 @@
+ experiment:
+     tokenizer_checkpoint: "tokenizer_titok_l32.bin"
+     generator_checkpoint: "generator_titok_l32.bin"
+
+ model:
+     vq_model:
+         codebook_size: 4096
+         token_size: 12
+         use_l2_norm: True
+         commitment_cost: 0.25
+         # vit arch
+         vit_enc_model_size: "large"
+         vit_dec_model_size: "large"
+         vit_enc_patch_size: 16
+         vit_dec_patch_size: 16
+         num_latent_tokens: 32
+
+     generator:
+         dropout: 0.1
+         attn_drop: 0.1
+         num_steps: 8
+         mask_schedule_strategy: "arccos"
+         class_label_dropout: 0.1
+         image_seq_len: ${model.vq_model.num_latent_tokens}
+         condition_num_classes: 1000
+
+ dataset:
+     preprocessing:
+         crop_size: 256
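
Note that image_seq_len is an OmegaConf interpolation, so the generator's sequence length always tracks the tokenizer's num_latent_tokens. A small sketch of how the file is consumed, mirroring demo_util.get_config (the asserted values are assumptions that follow directly from the fields above):

# Hedged sketch of loading this config with OmegaConf.
from omegaconf import OmegaConf

config = OmegaConf.load("configs/titok_l32.yaml")
# ${model.vq_model.num_latent_tokens} resolves lazily on access, so both fields agree.
assert config.model.generator.image_seq_len == config.model.vq_model.num_latent_tokens == 32
print(config.experiment.tokenizer_checkpoint)  # tokenizer_titok_l32.bin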
demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
demo_util.py ADDED
@@ -0,0 +1,81 @@
+ """Demo file for sampling images from TiTok.
+
+ Copyright (2024) Bytedance Ltd. and/or its affiliates
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+
+ import torch
+
+ from omegaconf import OmegaConf
+ from modeling.titok import TiTok
+ from modeling.maskgit import ImageBert
+
+
+ def get_config_cli():
+     cli_conf = OmegaConf.from_cli()
+
+     yaml_conf = OmegaConf.load(cli_conf.config)
+     conf = OmegaConf.merge(yaml_conf, cli_conf)
+
+     return conf
+
+ def get_config(config_path):
+     conf = OmegaConf.load(config_path)
+     return conf
+
+ def get_titok_tokenizer(config):
+     tokenizer = TiTok(config)
+     tokenizer.load_state_dict(torch.load(config.experiment.tokenizer_checkpoint))
+     tokenizer.eval()
+     tokenizer.requires_grad_(False)
+     return tokenizer
+
+ def get_titok_generator(config):
+     generator = ImageBert(config)
+     generator.load_state_dict(torch.load(config.experiment.generator_checkpoint))
+     generator.eval()
+     generator.requires_grad_(False)
+     return generator
+
+ @torch.no_grad()
+ def sample_fn(generator,
+               tokenizer,
+               labels=None,
+               guidance_scale=3.0,
+               randomize_temperature=2.0,
+               num_sample_steps=8,
+               device="cuda"):
+     generator.eval()
+     tokenizer.eval()
+     if labels is None:
+         # goldfish, chicken, tiger cat, hourglass, ship, dog, race car, airliner, teddy bear, random
+         labels = [1, 7, 282, 604, 724, 179, 751, 404, 850, torch.randint(0, 999, size=(1,))]
+
+     labels = torch.LongTensor(labels).to(device)
+
+     generated_tokens = generator.generate(
+         condition=labels,
+         guidance_scale=guidance_scale,
+         randomize_temperature=randomize_temperature,
+         num_sample_steps=num_sample_steps)
+
+     generated_image = tokenizer.decode_tokens(
+         generated_tokens.view(generated_tokens.shape[0], -1)
+     )
+
+     generated_image = torch.clamp(generated_image, 0.0, 1.0)
+     generated_image = (generated_image * 255.0).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
+
+     return generated_image
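
Unlike app.py, get_config_cli takes its arguments in OmegaConf's key=value dotlist form rather than argparse flags. A hypothetical driver built on it (the script name and override key are illustrative, not part of this upload) would be invoked and written roughly as:

# python my_sample_script.py config=configs/titok_l32.yaml model.generator.num_steps=16
# OmegaConf.from_cli() turns each key=value argument into config entries; `config=` names the
# YAML to load, and any other dotted key overrides the corresponding YAML field after the merge.
import demo_util

conf = demo_util.get_config_cli()               # YAML merged with CLI overrides
tokenizer = demo_util.get_titok_tokenizer(conf)
generator = demo_util.get_titok_generator(conf)
images = demo_util.sample_fn(generator.to("cuda"), tokenizer.to("cuda"), labels=[88] * 4)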
imagenet_classes.py ADDED
@@ -0,0 +1,1001 @@
1
+ imagenet_idx2classname = {
2
+ 0: 'tench, Tinca tinca',
3
+ 1: 'goldfish, Carassius auratus',
4
+ 2: 'great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias',
5
+ 3: 'tiger shark, Galeocerdo cuvieri',
6
+ 4: 'hammerhead, hammerhead shark',
7
+ 5: 'electric ray, crampfish, numbfish, torpedo',
8
+ 6: 'stingray',
9
+ 7: 'cock',
10
+ 8: 'hen',
11
+ 9: 'ostrich, Struthio camelus',
12
+ 10: 'brambling, Fringilla montifringilla',
13
+ 11: 'goldfinch, Carduelis carduelis',
14
+ 12: 'house finch, linnet, Carpodacus mexicanus',
15
+ 13: 'junco, snowbird',
16
+ 14: 'indigo bunting, indigo finch, indigo bird, Passerina cyanea',
17
+ 15: 'robin, American robin, Turdus migratorius',
18
+ 16: 'bulbul',
19
+ 17: 'jay',
20
+ 18: 'magpie',
21
+ 19: 'chickadee',
22
+ 20: 'water ouzel, dipper',
23
+ 21: 'kite',
24
+ 22: 'bald eagle, American eagle, Haliaeetus leucocephalus',
25
+ 23: 'vulture',
26
+ 24: 'great grey owl, great gray owl, Strix nebulosa',
27
+ 25: 'European fire salamander, Salamandra salamandra',
28
+ 26: 'common newt, Triturus vulgaris',
29
+ 27: 'eft',
30
+ 28: 'spotted salamander, Ambystoma maculatum',
31
+ 29: 'axolotl, mud puppy, Ambystoma mexicanum',
32
+ 30: 'bullfrog, Rana catesbeiana',
33
+ 31: 'tree frog, tree-frog',
34
+ 32: 'tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui',
35
+ 33: 'loggerhead, loggerhead turtle, Caretta caretta',
36
+ 34: 'leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea',
37
+ 35: 'mud turtle',
38
+ 36: 'terrapin',
39
+ 37: 'box turtle, box tortoise',
40
+ 38: 'banded gecko',
41
+ 39: 'common iguana, iguana, Iguana iguana',
42
+ 40: 'American chameleon, anole, Anolis carolinensis',
43
+ 41: 'whiptail, whiptail lizard',
44
+ 42: 'agama',
45
+ 43: 'frilled lizard, Chlamydosaurus kingi',
46
+ 44: 'alligator lizard',
47
+ 45: 'Gila monster, Heloderma suspectum',
48
+ 46: 'green lizard, Lacerta viridis',
49
+ 47: 'African chameleon, Chamaeleo chamaeleon',
50
+ 48: 'Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis',
51
+ 49: 'African crocodile, Nile crocodile, Crocodylus niloticus',
52
+ 50: 'American alligator, Alligator mississipiensis',
53
+ 51: 'triceratops',
54
+ 52: 'thunder snake, worm snake, Carphophis amoenus',
55
+ 53: 'ringneck snake, ring-necked snake, ring snake',
56
+ 54: 'hognose snake, puff adder, sand viper',
57
+ 55: 'green snake, grass snake',
58
+ 56: 'king snake, kingsnake',
59
+ 57: 'garter snake, grass snake',
60
+ 58: 'water snake',
61
+ 59: 'vine snake',
62
+ 60: 'night snake, Hypsiglena torquata',
63
+ 61: 'boa constrictor, Constrictor constrictor',
64
+ 62: 'rock python, rock snake, Python sebae',
65
+ 63: 'Indian cobra, Naja naja',
66
+ 64: 'green mamba',
67
+ 65: 'sea snake',
68
+ 66: 'horned viper, cerastes, sand viper, horned asp, Cerastes cornutus',
69
+ 67: 'diamondback, diamondback rattlesnake, Crotalus adamanteus',
70
+ 68: 'sidewinder, horned rattlesnake, Crotalus cerastes',
71
+ 69: 'trilobite',
72
+ 70: 'harvestman, daddy longlegs, Phalangium opilio',
73
+ 71: 'scorpion',
74
+ 72: 'black and gold garden spider, Argiope aurantia',
75
+ 73: 'barn spider, Araneus cavaticus',
76
+ 74: 'garden spider, Aranea diademata',
77
+ 75: 'black widow, Latrodectus mactans',
78
+ 76: 'tarantula',
79
+ 77: 'wolf spider, hunting spider',
80
+ 78: 'tick',
81
+ 79: 'centipede',
82
+ 80: 'black grouse',
83
+ 81: 'ptarmigan',
84
+ 82: 'ruffed grouse, partridge, Bonasa umbellus',
85
+ 83: 'prairie chicken, prairie grouse, prairie fowl',
86
+ 84: 'peacock',
87
+ 85: 'quail',
88
+ 86: 'partridge',
89
+ 87: 'African grey, African gray, Psittacus erithacus',
90
+ 88: 'macaw',
91
+ 89: 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita',
92
+ 90: 'lorikeet',
93
+ 91: 'coucal',
94
+ 92: 'bee eater',
95
+ 93: 'hornbill',
96
+ 94: 'hummingbird',
97
+ 95: 'jacamar',
98
+ 96: 'toucan',
99
+ 97: 'drake',
100
+ 98: 'red-breasted merganser, Mergus serrator',
101
+ 99: 'goose',
102
+ 100: 'black swan, Cygnus atratus',
103
+ 101: 'tusker',
104
+ 102: 'echidna, spiny anteater, anteater',
105
+ 103: 'platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus',
106
+ 104: 'wallaby, brush kangaroo',
107
+ 105: 'koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus',
108
+ 106: 'wombat',
109
+ 107: 'jellyfish',
110
+ 108: 'sea anemone, anemone',
111
+ 109: 'brain coral',
112
+ 110: 'flatworm, platyhelminth',
113
+ 111: 'nematode, nematode worm, roundworm',
114
+ 112: 'conch',
115
+ 113: 'snail',
116
+ 114: 'slug',
117
+ 115: 'sea slug, nudibranch',
118
+ 116: 'chiton, coat-of-mail shell, sea cradle, polyplacophore',
119
+ 117: 'chambered nautilus, pearly nautilus, nautilus',
120
+ 118: 'Dungeness crab, Cancer magister',
121
+ 119: 'rock crab, Cancer irroratus',
122
+ 120: 'fiddler crab',
123
+ 121: 'king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica',
124
+ 122: 'American lobster, Northern lobster, Maine lobster, Homarus americanus',
125
+ 123: 'spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish',
126
+ 124: 'crayfish, crawfish, crawdad, crawdaddy',
127
+ 125: 'hermit crab',
128
+ 126: 'isopod',
129
+ 127: 'white stork, Ciconia ciconia',
130
+ 128: 'black stork, Ciconia nigra',
131
+ 129: 'spoonbill',
132
+ 130: 'flamingo',
133
+ 131: 'little blue heron, Egretta caerulea',
134
+ 132: 'American egret, great white heron, Egretta albus',
135
+ 133: 'bittern',
136
+ 134: 'crane',
137
+ 135: 'limpkin, Aramus pictus',
138
+ 136: 'European gallinule, Porphyrio porphyrio',
139
+ 137: 'American coot, marsh hen, mud hen, water hen, Fulica americana',
140
+ 138: 'bustard',
141
+ 139: 'ruddy turnstone, Arenaria interpres',
142
+ 140: 'red-backed sandpiper, dunlin, Erolia alpina',
143
+ 141: 'redshank, Tringa totanus',
144
+ 142: 'dowitcher',
145
+ 143: 'oystercatcher, oyster catcher',
146
+ 144: 'pelican',
147
+ 145: 'king penguin, Aptenodytes patagonica',
148
+ 146: 'albatross, mollymawk',
149
+ 147: 'grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus',
150
+ 148: 'killer whale, killer, orca, grampus, sea wolf, Orcinus orca',
151
+ 149: 'dugong, Dugong dugon',
152
+ 150: 'sea lion',
153
+ 151: 'Chihuahua',
154
+ 152: 'Japanese spaniel',
155
+ 153: 'Maltese dog, Maltese terrier, Maltese',
156
+ 154: 'Pekinese, Pekingese, Peke',
157
+ 155: 'Shih-Tzu',
158
+ 156: 'Blenheim spaniel',
159
+ 157: 'papillon',
160
+ 158: 'toy terrier',
161
+ 159: 'Rhodesian ridgeback',
162
+ 160: 'Afghan hound, Afghan',
163
+ 161: 'basset, basset hound',
164
+ 162: 'beagle',
165
+ 163: 'bloodhound, sleuthhound',
166
+ 164: 'bluetick',
167
+ 165: 'black-and-tan coonhound',
168
+ 166: 'Walker hound, Walker foxhound',
169
+ 167: 'English foxhound',
170
+ 168: 'redbone',
171
+ 169: 'borzoi, Russian wolfhound',
172
+ 170: 'Irish wolfhound',
173
+ 171: 'Italian greyhound',
174
+ 172: 'whippet',
175
+ 173: 'Ibizan hound, Ibizan Podenco',
176
+ 174: 'Norwegian elkhound, elkhound',
177
+ 175: 'otterhound, otter hound',
178
+ 176: 'Saluki, gazelle hound',
179
+ 177: 'Scottish deerhound, deerhound',
180
+ 178: 'Weimaraner',
181
+ 179: 'Staffordshire bullterrier, Staffordshire bull terrier',
182
+ 180: 'American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier',
183
+ 181: 'Bedlington terrier',
184
+ 182: 'Border terrier',
185
+ 183: 'Kerry blue terrier',
186
+ 184: 'Irish terrier',
187
+ 185: 'Norfolk terrier',
188
+ 186: 'Norwich terrier',
189
+ 187: 'Yorkshire terrier',
190
+ 188: 'wire-haired fox terrier',
191
+ 189: 'Lakeland terrier',
192
+ 190: 'Sealyham terrier, Sealyham',
193
+ 191: 'Airedale, Airedale terrier',
194
+ 192: 'cairn, cairn terrier',
195
+ 193: 'Australian terrier',
196
+ 194: 'Dandie Dinmont, Dandie Dinmont terrier',
197
+ 195: 'Boston bull, Boston terrier',
198
+ 196: 'miniature schnauzer',
199
+ 197: 'giant schnauzer',
200
+ 198: 'standard schnauzer',
201
+ 199: 'Scotch terrier, Scottish terrier, Scottie',
202
+ 200: 'Tibetan terrier, chrysanthemum dog',
203
+ 201: 'silky terrier, Sydney silky',
204
+ 202: 'soft-coated wheaten terrier',
205
+ 203: 'West Highland white terrier',
206
+ 204: 'Lhasa, Lhasa apso',
207
+ 205: 'flat-coated retriever',
208
+ 206: 'curly-coated retriever',
209
+ 207: 'golden retriever',
210
+ 208: 'Labrador retriever',
211
+ 209: 'Chesapeake Bay retriever',
212
+ 210: 'German short-haired pointer',
213
+ 211: 'vizsla, Hungarian pointer',
214
+ 212: 'English setter',
215
+ 213: 'Irish setter, red setter',
216
+ 214: 'Gordon setter',
217
+ 215: 'Brittany spaniel',
218
+ 216: 'clumber, clumber spaniel',
219
+ 217: 'English springer, English springer spaniel',
220
+ 218: 'Welsh springer spaniel',
221
+ 219: 'cocker spaniel, English cocker spaniel, cocker',
222
+ 220: 'Sussex spaniel',
223
+ 221: 'Irish water spaniel',
224
+ 222: 'kuvasz',
225
+ 223: 'schipperke',
226
+ 224: 'groenendael',
227
+ 225: 'malinois',
228
+ 226: 'briard',
229
+ 227: 'kelpie',
230
+ 228: 'komondor',
231
+ 229: 'Old English sheepdog, bobtail',
232
+ 230: 'Shetland sheepdog, Shetland sheep dog, Shetland',
233
+ 231: 'collie',
234
+ 232: 'Border collie',
235
+ 233: 'Bouvier des Flandres, Bouviers des Flandres',
236
+ 234: 'Rottweiler',
237
+ 235: 'German shepherd, German shepherd dog, German police dog, alsatian',
238
+ 236: 'Doberman, Doberman pinscher',
239
+ 237: 'miniature pinscher',
240
+ 238: 'Greater Swiss Mountain dog',
241
+ 239: 'Bernese mountain dog',
242
+ 240: 'Appenzeller',
243
+ 241: 'EntleBucher',
244
+ 242: 'boxer',
245
+ 243: 'bull mastiff',
246
+ 244: 'Tibetan mastiff',
247
+ 245: 'French bulldog',
248
+ 246: 'Great Dane',
249
+ 247: 'Saint Bernard, St Bernard',
250
+ 248: 'Eskimo dog, husky',
251
+ 249: 'malamute, malemute, Alaskan malamute',
252
+ 250: 'Siberian husky',
253
+ 251: 'dalmatian, coach dog, carriage dog',
254
+ 252: 'affenpinscher, monkey pinscher, monkey dog',
255
+ 253: 'basenji',
256
+ 254: 'pug, pug-dog',
257
+ 255: 'Leonberg',
258
+ 256: 'Newfoundland, Newfoundland dog',
259
+ 257: 'Great Pyrenees',
260
+ 258: 'Samoyed, Samoyede',
261
+ 259: 'Pomeranian',
262
+ 260: 'chow, chow chow',
263
+ 261: 'keeshond',
264
+ 262: 'Brabancon griffon',
265
+ 263: 'Pembroke, Pembroke Welsh corgi',
266
+ 264: 'Cardigan, Cardigan Welsh corgi',
267
+ 265: 'toy poodle',
268
+ 266: 'miniature poodle',
269
+ 267: 'standard poodle',
270
+ 268: 'Mexican hairless',
271
+ 269: 'timber wolf, grey wolf, gray wolf, Canis lupus',
272
+ 270: 'white wolf, Arctic wolf, Canis lupus tundrarum',
273
+ 271: 'red wolf, maned wolf, Canis rufus, Canis niger',
274
+ 272: 'coyote, prairie wolf, brush wolf, Canis latrans',
275
+ 273: 'dingo, warrigal, warragal, Canis dingo',
276
+ 274: 'dhole, Cuon alpinus',
277
+ 275: 'African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus',
278
+ 276: 'hyena, hyaena',
279
+ 277: 'red fox, Vulpes vulpes',
280
+ 278: 'kit fox, Vulpes macrotis',
281
+ 279: 'Arctic fox, white fox, Alopex lagopus',
282
+ 280: 'grey fox, gray fox, Urocyon cinereoargenteus',
283
+ 281: 'tabby, tabby cat',
284
+ 282: 'tiger cat',
285
+ 283: 'Persian cat',
286
+ 284: 'Siamese cat, Siamese',
287
+ 285: 'Egyptian cat',
288
+ 286: 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor',
289
+ 287: 'lynx, catamount',
290
+ 288: 'leopard, Panthera pardus',
291
+ 289: 'snow leopard, ounce, Panthera uncia',
292
+ 290: 'jaguar, panther, Panthera onca, Felis onca',
293
+ 291: 'lion, king of beasts, Panthera leo',
294
+ 292: 'tiger, Panthera tigris',
295
+ 293: 'cheetah, chetah, Acinonyx jubatus',
296
+ 294: 'brown bear, bruin, Ursus arctos',
297
+ 295: 'American black bear, black bear, Ursus americanus, Euarctos americanus',
298
+ 296: 'ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus',
299
+ 297: 'sloth bear, Melursus ursinus, Ursus ursinus',
300
+ 298: 'mongoose',
301
+ 299: 'meerkat, mierkat',
302
+ 300: 'tiger beetle',
303
+ 301: 'ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle',
304
+ 302: 'ground beetle, carabid beetle',
305
+ 303: 'long-horned beetle, longicorn, longicorn beetle',
306
+ 304: 'leaf beetle, chrysomelid',
307
+ 305: 'dung beetle',
308
+ 306: 'rhinoceros beetle',
309
+ 307: 'weevil',
310
+ 308: 'fly',
311
+ 309: 'bee',
312
+ 310: 'ant, emmet, pismire',
313
+ 311: 'grasshopper, hopper',
314
+ 312: 'cricket',
315
+ 313: 'walking stick, walkingstick, stick insect',
316
+ 314: 'cockroach, roach',
317
+ 315: 'mantis, mantid',
318
+ 316: 'cicada, cicala',
319
+ 317: 'leafhopper',
320
+ 318: 'lacewing, lacewing fly',
321
+ 319: "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk",
322
+ 320: 'damselfly',
323
+ 321: 'admiral',
324
+ 322: 'ringlet, ringlet butterfly',
325
+ 323: 'monarch, monarch butterfly, milkweed butterfly, Danaus plexippus',
326
+ 324: 'cabbage butterfly',
327
+ 325: 'sulphur butterfly, sulfur butterfly',
328
+ 326: 'lycaenid, lycaenid butterfly',
329
+ 327: 'starfish, sea star',
330
+ 328: 'sea urchin',
331
+ 329: 'sea cucumber, holothurian',
332
+ 330: 'wood rabbit, cottontail, cottontail rabbit',
333
+ 331: 'hare',
334
+ 332: 'Angora, Angora rabbit',
335
+ 333: 'hamster',
336
+ 334: 'porcupine, hedgehog',
337
+ 335: 'fox squirrel, eastern fox squirrel, Sciurus niger',
338
+ 336: 'marmot',
339
+ 337: 'beaver',
340
+ 338: 'guinea pig, Cavia cobaya',
341
+ 339: 'sorrel',
342
+ 340: 'zebra',
343
+ 341: 'hog, pig, grunter, squealer, Sus scrofa',
344
+ 342: 'wild boar, boar, Sus scrofa',
345
+ 343: 'warthog',
346
+ 344: 'hippopotamus, hippo, river horse, Hippopotamus amphibius',
347
+ 345: 'ox',
348
+ 346: 'water buffalo, water ox, Asiatic buffalo, Bubalus bubalis',
349
+ 347: 'bison',
350
+ 348: 'ram, tup',
351
+ 349: 'bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis',
352
+ 350: 'ibex, Capra ibex',
353
+ 351: 'hartebeest',
354
+ 352: 'impala, Aepyceros melampus',
355
+ 353: 'gazelle',
356
+ 354: 'Arabian camel, dromedary, Camelus dromedarius',
357
+ 355: 'llama',
358
+ 356: 'weasel',
359
+ 357: 'mink',
360
+ 358: 'polecat, fitch, foulmart, foumart, Mustela putorius',
361
+ 359: 'black-footed ferret, ferret, Mustela nigripes',
362
+ 360: 'otter',
363
+ 361: 'skunk, polecat, wood pussy',
364
+ 362: 'badger',
365
+ 363: 'armadillo',
366
+ 364: 'three-toed sloth, ai, Bradypus tridactylus',
367
+ 365: 'orangutan, orang, orangutang, Pongo pygmaeus',
368
+ 366: 'gorilla, Gorilla gorilla',
369
+ 367: 'chimpanzee, chimp, Pan troglodytes',
370
+ 368: 'gibbon, Hylobates lar',
371
+ 369: 'siamang, Hylobates syndactylus, Symphalangus syndactylus',
372
+ 370: 'guenon, guenon monkey',
373
+ 371: 'patas, hussar monkey, Erythrocebus patas',
374
+ 372: 'baboon',
375
+ 373: 'macaque',
376
+ 374: 'langur',
377
+ 375: 'colobus, colobus monkey',
378
+ 376: 'proboscis monkey, Nasalis larvatus',
379
+ 377: 'marmoset',
380
+ 378: 'capuchin, ringtail, Cebus capucinus',
381
+ 379: 'howler monkey, howler',
382
+ 380: 'titi, titi monkey',
383
+ 381: 'spider monkey, Ateles geoffroyi',
384
+ 382: 'squirrel monkey, Saimiri sciureus',
385
+ 383: 'Madagascar cat, ring-tailed lemur, Lemur catta',
386
+ 384: 'indri, indris, Indri indri, Indri brevicaudatus',
387
+ 385: 'Indian elephant, Elephas maximus',
388
+ 386: 'African elephant, Loxodonta africana',
389
+ 387: 'lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens',
390
+ 388: 'giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca',
391
+ 389: 'barracouta, snoek',
392
+ 390: 'eel',
393
+ 391: 'coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch',
394
+ 392: 'rock beauty, Holocanthus tricolor',
395
+ 393: 'anemone fish',
396
+ 394: 'sturgeon',
397
+ 395: 'gar, garfish, garpike, billfish, Lepisosteus osseus',
398
+ 396: 'lionfish',
399
+ 397: 'puffer, pufferfish, blowfish, globefish',
400
+ 398: 'abacus',
401
+ 399: 'abaya',
402
+ 400: "academic gown, academic robe, judge's robe",
403
+ 401: 'accordion, piano accordion, squeeze box',
404
+ 402: 'acoustic guitar',
405
+ 403: 'aircraft carrier, carrier, flattop, attack aircraft carrier',
406
+ 404: 'airliner',
407
+ 405: 'airship, dirigible',
408
+ 406: 'altar',
409
+ 407: 'ambulance',
410
+ 408: 'amphibian, amphibious vehicle',
411
+ 409: 'analog clock',
412
+ 410: 'apiary, bee house',
413
+ 411: 'apron',
414
+ 412: 'ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin',
415
+ 413: 'assault rifle, assault gun',
416
+ 414: 'backpack, back pack, knapsack, packsack, rucksack, haversack',
417
+ 415: 'bakery, bakeshop, bakehouse',
418
+ 416: 'balance beam, beam',
419
+ 417: 'balloon',
420
+ 418: 'ballpoint, ballpoint pen, ballpen, Biro',
421
+ 419: 'Band Aid',
422
+ 420: 'banjo',
423
+ 421: 'bannister, banister, balustrade, balusters, handrail',
424
+ 422: 'barbell',
425
+ 423: 'barber chair',
426
+ 424: 'barbershop',
427
+ 425: 'barn',
428
+ 426: 'barometer',
429
+ 427: 'barrel, cask',
430
+ 428: 'barrow, garden cart, lawn cart, wheelbarrow',
431
+ 429: 'baseball',
432
+ 430: 'basketball',
433
+ 431: 'bassinet',
434
+ 432: 'bassoon',
435
+ 433: 'bathing cap, swimming cap',
436
+ 434: 'bath towel',
437
+ 435: 'bathtub, bathing tub, bath, tub',
438
+ 436: 'beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon',
439
+ 437: 'beacon, lighthouse, beacon light, pharos',
440
+ 438: 'beaker',
441
+ 439: 'bearskin, busby, shako',
442
+ 440: 'beer bottle',
443
+ 441: 'beer glass',
444
+ 442: 'bell cote, bell cot',
445
+ 443: 'bib',
446
+ 444: 'bicycle-built-for-two, tandem bicycle, tandem',
447
+ 445: 'bikini, two-piece',
448
+ 446: 'binder, ring-binder',
449
+ 447: 'binoculars, field glasses, opera glasses',
450
+ 448: 'birdhouse',
451
+ 449: 'boathouse',
452
+ 450: 'bobsled, bobsleigh, bob',
453
+ 451: 'bolo tie, bolo, bola tie, bola',
454
+ 452: 'bonnet, poke bonnet',
455
+ 453: 'bookcase',
456
+ 454: 'bookshop, bookstore, bookstall',
457
+ 455: 'bottlecap',
458
+ 456: 'bow',
459
+ 457: 'bow tie, bow-tie, bowtie',
460
+ 458: 'brass, memorial tablet, plaque',
461
+ 459: 'brassiere, bra, bandeau',
462
+ 460: 'breakwater, groin, groyne, mole, bulwark, seawall, jetty',
463
+ 461: 'breastplate, aegis, egis',
464
+ 462: 'broom',
465
+ 463: 'bucket, pail',
466
+ 464: 'buckle',
467
+ 465: 'bulletproof vest',
468
+ 466: 'bullet train, bullet',
469
+ 467: 'butcher shop, meat market',
470
+ 468: 'cab, hack, taxi, taxicab',
471
+ 469: 'caldron, cauldron',
472
+ 470: 'candle, taper, wax light',
473
+ 471: 'cannon',
474
+ 472: 'canoe',
475
+ 473: 'can opener, tin opener',
476
+ 474: 'cardigan',
477
+ 475: 'car mirror',
478
+ 476: 'carousel, carrousel, merry-go-round, roundabout, whirligig',
479
+ 477: "carpenter's kit, tool kit",
480
+ 478: 'carton',
481
+ 479: 'car wheel',
482
+ 480: 'cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM',
483
+ 481: 'cassette',
484
+ 482: 'cassette player',
485
+ 483: 'castle',
486
+ 484: 'catamaran',
487
+ 485: 'CD player',
488
+ 486: 'cello, violoncello',
489
+ 487: 'cellular telephone, cellular phone, cellphone, cell, mobile phone',
490
+ 488: 'chain',
491
+ 489: 'chainlink fence',
492
+ 490: 'chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour',
493
+ 491: 'chain saw, chainsaw',
494
+ 492: 'chest',
495
+ 493: 'chiffonier, commode',
496
+ 494: 'chime, bell, gong',
497
+ 495: 'china cabinet, china closet',
498
+ 496: 'Christmas stocking',
499
+ 497: 'church, church building',
500
+ 498: 'cinema, movie theater, movie theatre, movie house, picture palace',
501
+ 499: 'cleaver, meat cleaver, chopper',
502
+ 500: 'cliff dwelling',
503
+ 501: 'cloak',
504
+ 502: 'clog, geta, patten, sabot',
505
+ 503: 'cocktail shaker',
506
+ 504: 'coffee mug',
507
+ 505: 'coffeepot',
508
+ 506: 'coil, spiral, volute, whorl, helix',
509
+ 507: 'combination lock',
510
+ 508: 'computer keyboard, keypad',
511
+ 509: 'confectionery, confectionary, candy store',
512
+ 510: 'container ship, containership, container vessel',
513
+ 511: 'convertible',
514
+ 512: 'corkscrew, bottle screw',
515
+ 513: 'cornet, horn, trumpet, trump',
516
+ 514: 'cowboy boot',
517
+ 515: 'cowboy hat, ten-gallon hat',
518
+ 516: 'cradle',
519
+ 517: 'crane',
520
+ 518: 'crash helmet',
521
+ 519: 'crate',
522
+ 520: 'crib, cot',
523
+ 521: 'Crock Pot',
524
+ 522: 'croquet ball',
525
+ 523: 'crutch',
526
+ 524: 'cuirass',
527
+ 525: 'dam, dike, dyke',
528
+ 526: 'desk',
529
+ 527: 'desktop computer',
530
+ 528: 'dial telephone, dial phone',
531
+ 529: 'diaper, nappy, napkin',
532
+ 530: 'digital clock',
533
+ 531: 'digital watch',
534
+ 532: 'dining table, board',
535
+ 533: 'dishrag, dishcloth',
536
+ 534: 'dishwasher, dish washer, dishwashing machine',
537
+ 535: 'disk brake, disc brake',
538
+ 536: 'dock, dockage, docking facility',
539
+ 537: 'dogsled, dog sled, dog sleigh',
540
+ 538: 'dome',
541
+ 539: 'doormat, welcome mat',
542
+ 540: 'drilling platform, offshore rig',
543
+ 541: 'drum, membranophone, tympan',
544
+ 542: 'drumstick',
545
+ 543: 'dumbbell',
546
+ 544: 'Dutch oven',
547
+ 545: 'electric fan, blower',
548
+ 546: 'electric guitar',
549
+ 547: 'electric locomotive',
550
+ 548: 'entertainment center',
551
+ 549: 'envelope',
552
+ 550: 'espresso maker',
553
+ 551: 'face powder',
554
+ 552: 'feather boa, boa',
555
+ 553: 'file, file cabinet, filing cabinet',
556
+ 554: 'fireboat',
557
+ 555: 'fire engine, fire truck',
558
+ 556: 'fire screen, fireguard',
559
+ 557: 'flagpole, flagstaff',
560
+ 558: 'flute, transverse flute',
561
+ 559: 'folding chair',
562
+ 560: 'football helmet',
563
+ 561: 'forklift',
564
+ 562: 'fountain',
565
+ 563: 'fountain pen',
566
+ 564: 'four-poster',
567
+ 565: 'freight car',
568
+ 566: 'French horn, horn',
569
+ 567: 'frying pan, frypan, skillet',
570
+ 568: 'fur coat',
571
+ 569: 'garbage truck, dustcart',
572
+ 570: 'gasmask, respirator, gas helmet',
573
+ 571: 'gas pump, gasoline pump, petrol pump, island dispenser',
574
+ 572: 'goblet',
575
+ 573: 'go-kart',
576
+ 574: 'golf ball',
577
+ 575: 'golfcart, golf cart',
578
+ 576: 'gondola',
579
+ 577: 'gong, tam-tam',
580
+ 578: 'gown',
581
+ 579: 'grand piano, grand',
582
+ 580: 'greenhouse, nursery, glasshouse',
583
+ 581: 'grille, radiator grille',
584
+ 582: 'grocery store, grocery, food market, market',
585
+ 583: 'guillotine',
586
+ 584: 'hair slide',
587
+ 585: 'hair spray',
588
+ 586: 'half track',
589
+ 587: 'hammer',
590
+ 588: 'hamper',
591
+ 589: 'hand blower, blow dryer, blow drier, hair dryer, hair drier',
592
+ 590: 'hand-held computer, hand-held microcomputer',
593
+ 591: 'handkerchief, hankie, hanky, hankey',
594
+ 592: 'hard disc, hard disk, fixed disk',
595
+ 593: 'harmonica, mouth organ, harp, mouth harp',
596
+ 594: 'harp',
597
+ 595: 'harvester, reaper',
598
+ 596: 'hatchet',
599
+ 597: 'holster',
600
+ 598: 'home theater, home theatre',
601
+ 599: 'honeycomb',
602
+ 600: 'hook, claw',
603
+ 601: 'hoopskirt, crinoline',
604
+ 602: 'horizontal bar, high bar',
605
+ 603: 'horse cart, horse-cart',
606
+ 604: 'hourglass',
607
+ 605: 'iPod',
608
+ 606: 'iron, smoothing iron',
609
+ 607: "jack-o'-lantern",
610
+ 608: 'jean, blue jean, denim',
611
+ 609: 'jeep, landrover',
612
+ 610: 'jersey, T-shirt, tee shirt',
613
+ 611: 'jigsaw puzzle',
614
+ 612: 'jinrikisha, ricksha, rickshaw',
615
+ 613: 'joystick',
616
+ 614: 'kimono',
617
+ 615: 'knee pad',
618
+ 616: 'knot',
619
+ 617: 'lab coat, laboratory coat',
620
+ 618: 'ladle',
621
+ 619: 'lampshade, lamp shade',
622
+ 620: 'laptop, laptop computer',
623
+ 621: 'lawn mower, mower',
624
+ 622: 'lens cap, lens cover',
625
+ 623: 'letter opener, paper knife, paperknife',
626
+ 624: 'library',
627
+ 625: 'lifeboat',
628
+ 626: 'lighter, light, igniter, ignitor',
629
+ 627: 'limousine, limo',
630
+ 628: 'liner, ocean liner',
631
+ 629: 'lipstick, lip rouge',
632
+ 630: 'Loafer',
633
+ 631: 'lotion',
634
+ 632: 'loudspeaker, speaker, speaker unit, loudspeaker system, speaker system',
635
+ 633: "loupe, jeweler's loupe",
636
+ 634: 'lumbermill, sawmill',
637
+ 635: 'magnetic compass',
638
+ 636: 'mailbag, postbag',
639
+ 637: 'mailbox, letter box',
640
+ 638: 'maillot',
641
+ 639: 'maillot, tank suit',
642
+ 640: 'manhole cover',
643
+ 641: 'maraca',
644
+ 642: 'marimba, xylophone',
645
+ 643: 'mask',
646
+ 644: 'matchstick',
647
+ 645: 'maypole',
648
+ 646: 'maze, labyrinth',
649
+ 647: 'measuring cup',
650
+ 648: 'medicine chest, medicine cabinet',
651
+ 649: 'megalith, megalithic structure',
652
+ 650: 'microphone, mike',
653
+ 651: 'microwave, microwave oven',
654
+ 652: 'military uniform',
655
+ 653: 'milk can',
656
+ 654: 'minibus',
657
+ 655: 'miniskirt, mini',
658
+ 656: 'minivan',
659
+ 657: 'missile',
660
+ 658: 'mitten',
661
+ 659: 'mixing bowl',
662
+ 660: 'mobile home, manufactured home',
663
+ 661: 'Model T',
664
+ 662: 'modem',
665
+ 663: 'monastery',
666
+ 664: 'monitor',
667
+ 665: 'moped',
668
+ 666: 'mortar',
669
+ 667: 'mortarboard',
670
+ 668: 'mosque',
671
+ 669: 'mosquito net',
672
+ 670: 'motor scooter, scooter',
673
+ 671: 'mountain bike, all-terrain bike, off-roader',
674
+ 672: 'mountain tent',
675
+ 673: 'mouse, computer mouse',
676
+ 674: 'mousetrap',
677
+ 675: 'moving van',
678
+ 676: 'muzzle',
679
+ 677: 'nail',
680
+ 678: 'neck brace',
681
+ 679: 'necklace',
682
+ 680: 'nipple',
683
+ 681: 'notebook, notebook computer',
684
+ 682: 'obelisk',
685
+ 683: 'oboe, hautboy, hautbois',
686
+ 684: 'ocarina, sweet potato',
687
+ 685: 'odometer, hodometer, mileometer, milometer',
688
+ 686: 'oil filter',
689
+ 687: 'organ, pipe organ',
690
+ 688: 'oscilloscope, scope, cathode-ray oscilloscope, CRO',
691
+ 689: 'overskirt',
692
+ 690: 'oxcart',
693
+ 691: 'oxygen mask',
694
+ 692: 'packet',
695
+ 693: 'paddle, boat paddle',
696
+ 694: 'paddlewheel, paddle wheel',
697
+ 695: 'padlock',
698
+ 696: 'paintbrush',
699
+ 697: "pajama, pyjama, pj's, jammies",
700
+ 698: 'palace',
701
+ 699: 'panpipe, pandean pipe, syrinx',
702
+ 700: 'paper towel',
703
+ 701: 'parachute, chute',
704
+ 702: 'parallel bars, bars',
705
+ 703: 'park bench',
706
+ 704: 'parking meter',
707
+ 705: 'passenger car, coach, carriage',
708
+ 706: 'patio, terrace',
709
+ 707: 'pay-phone, pay-station',
710
+ 708: 'pedestal, plinth, footstall',
711
+ 709: 'pencil box, pencil case',
712
+ 710: 'pencil sharpener',
713
+ 711: 'perfume, essence',
714
+ 712: 'Petri dish',
715
+ 713: 'photocopier',
716
+ 714: 'pick, plectrum, plectron',
717
+ 715: 'pickelhaube',
718
+ 716: 'picket fence, paling',
719
+ 717: 'pickup, pickup truck',
720
+ 718: 'pier',
721
+ 719: 'piggy bank, penny bank',
722
+ 720: 'pill bottle',
723
+ 721: 'pillow',
724
+ 722: 'ping-pong ball',
725
+ 723: 'pinwheel',
726
+ 724: 'pirate, pirate ship',
727
+ 725: 'pitcher, ewer',
728
+ 726: "plane, carpenter's plane, woodworking plane",
729
+ 727: 'planetarium',
730
+ 728: 'plastic bag',
731
+ 729: 'plate rack',
732
+ 730: 'plow, plough',
733
+ 731: "plunger, plumber's helper",
734
+ 732: 'Polaroid camera, Polaroid Land camera',
735
+ 733: 'pole',
736
+ 734: 'police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria',
737
+ 735: 'poncho',
738
+ 736: 'pool table, billiard table, snooker table',
739
+ 737: 'pop bottle, soda bottle',
740
+ 738: 'pot, flowerpot',
741
+ 739: "potter's wheel",
742
+ 740: 'power drill',
743
+ 741: 'prayer rug, prayer mat',
744
+ 742: 'printer',
745
+ 743: 'prison, prison house',
746
+ 744: 'projectile, missile',
747
+ 745: 'projector',
748
+ 746: 'puck, hockey puck',
749
+ 747: 'punching bag, punch bag, punching ball, punchball',
750
+ 748: 'purse',
751
+ 749: 'quill, quill pen',
752
+ 750: 'quilt, comforter, comfort, puff',
753
+ 751: 'racer, race car, racing car',
754
+ 752: 'racket, racquet',
755
+ 753: 'radiator',
756
+ 754: 'radio, wireless',
757
+ 755: 'radio telescope, radio reflector',
758
+ 756: 'rain barrel',
759
+ 757: 'recreational vehicle, RV, R.V.',
760
+ 758: 'reel',
761
+ 759: 'reflex camera',
762
+ 760: 'refrigerator, icebox',
763
+ 761: 'remote control, remote',
764
+ 762: 'restaurant, eating house, eating place, eatery',
765
+ 763: 'revolver, six-gun, six-shooter',
766
+ 764: 'rifle',
767
+ 765: 'rocking chair, rocker',
768
+ 766: 'rotisserie',
769
+ 767: 'rubber eraser, rubber, pencil eraser',
770
+ 768: 'rugby ball',
771
+ 769: 'rule, ruler',
772
+ 770: 'running shoe',
773
+ 771: 'safe',
774
+ 772: 'safety pin',
775
+ 773: 'saltshaker, salt shaker',
776
+ 774: 'sandal',
777
+ 775: 'sarong',
778
+ 776: 'sax, saxophone',
779
+ 777: 'scabbard',
780
+ 778: 'scale, weighing machine',
781
+ 779: 'school bus',
782
+ 780: 'schooner',
783
+ 781: 'scoreboard',
784
+ 782: 'screen, CRT screen',
785
+ 783: 'screw',
786
+ 784: 'screwdriver',
787
+ 785: 'seat belt, seatbelt',
788
+ 786: 'sewing machine',
789
+ 787: 'shield, buckler',
790
+ 788: 'shoe shop, shoe-shop, shoe store',
791
+ 789: 'shoji',
792
+ 790: 'shopping basket',
793
+ 791: 'shopping cart',
794
+ 792: 'shovel',
795
+ 793: 'shower cap',
796
+ 794: 'shower curtain',
797
+ 795: 'ski',
798
+ 796: 'ski mask',
799
+ 797: 'sleeping bag',
800
+ 798: 'slide rule, slipstick',
801
+ 799: 'sliding door',
802
+ 800: 'slot, one-armed bandit',
803
+ 801: 'snorkel',
804
+ 802: 'snowmobile',
805
+ 803: 'snowplow, snowplough',
806
+ 804: 'soap dispenser',
807
+ 805: 'soccer ball',
808
+ 806: 'sock',
809
+ 807: 'solar dish, solar collector, solar furnace',
810
+ 808: 'sombrero',
811
+ 809: 'soup bowl',
812
+ 810: 'space bar',
813
+ 811: 'space heater',
814
+ 812: 'space shuttle',
815
+ 813: 'spatula',
816
+ 814: 'speedboat',
817
+ 815: "spider web, spider's web",
818
+ 816: 'spindle',
819
+ 817: 'sports car, sport car',
820
+ 818: 'spotlight, spot',
821
+ 819: 'stage',
822
+ 820: 'steam locomotive',
823
+ 821: 'steel arch bridge',
824
+ 822: 'steel drum',
825
+ 823: 'stethoscope',
826
+ 824: 'stole',
827
+ 825: 'stone wall',
828
+ 826: 'stopwatch, stop watch',
829
+ 827: 'stove',
830
+ 828: 'strainer',
831
+ 829: 'streetcar, tram, tramcar, trolley, trolley car',
832
+ 830: 'stretcher',
833
+ 831: 'studio couch, day bed',
834
+ 832: 'stupa, tope',
835
+ 833: 'submarine, pigboat, sub, U-boat',
836
+ 834: 'suit, suit of clothes',
837
+ 835: 'sundial',
838
+ 836: 'sunglass',
839
+ 837: 'sunglasses, dark glasses, shades',
840
+ 838: 'sunscreen, sunblock, sun blocker',
841
+ 839: 'suspension bridge',
842
+ 840: 'swab, swob, mop',
843
+ 841: 'sweatshirt',
844
+ 842: 'swimming trunks, bathing trunks',
845
+ 843: 'swing',
846
+ 844: 'switch, electric switch, electrical switch',
847
+ 845: 'syringe',
848
+ 846: 'table lamp',
849
+ 847: 'tank, army tank, armored combat vehicle, armoured combat vehicle',
850
+ 848: 'tape player',
851
+ 849: 'teapot',
852
+ 850: 'teddy, teddy bear',
853
+ 851: 'television, television system',
854
+ 852: 'tennis ball',
855
+ 853: 'thatch, thatched roof',
856
+ 854: 'theater curtain, theatre curtain',
857
+ 855: 'thimble',
858
+ 856: 'thresher, thrasher, threshing machine',
859
+ 857: 'throne',
860
+ 858: 'tile roof',
861
+ 859: 'toaster',
862
+ 860: 'tobacco shop, tobacconist shop, tobacconist',
863
+ 861: 'toilet seat',
864
+ 862: 'torch',
865
+ 863: 'totem pole',
866
+ 864: 'tow truck, tow car, wrecker',
867
+ 865: 'toyshop',
868
+ 866: 'tractor',
869
+ 867: 'trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi',
870
+ 868: 'tray',
871
+ 869: 'trench coat',
872
+ 870: 'tricycle, trike, velocipede',
873
+ 871: 'trimaran',
874
+ 872: 'tripod',
875
+ 873: 'triumphal arch',
876
+ 874: 'trolleybus, trolley coach, trackless trolley',
877
+ 875: 'trombone',
878
+ 876: 'tub, vat',
879
+ 877: 'turnstile',
880
+ 878: 'typewriter keyboard',
881
+ 879: 'umbrella',
882
+ 880: 'unicycle, monocycle',
883
+ 881: 'upright, upright piano',
884
+ 882: 'vacuum, vacuum cleaner',
885
+ 883: 'vase',
886
+ 884: 'vault',
887
+ 885: 'velvet',
888
+ 886: 'vending machine',
889
+ 887: 'vestment',
890
+ 888: 'viaduct',
891
+ 889: 'violin, fiddle',
892
+ 890: 'volleyball',
893
+ 891: 'waffle iron',
894
+ 892: 'wall clock',
895
+ 893: 'wallet, billfold, notecase, pocketbook',
896
+ 894: 'wardrobe, closet, press',
897
+ 895: 'warplane, military plane',
898
+ 896: 'washbasin, handbasin, washbowl, lavabo, wash-hand basin',
899
+ 897: 'washer, automatic washer, washing machine',
900
+ 898: 'water bottle',
901
+ 899: 'water jug',
902
+ 900: 'water tower',
903
+ 901: 'whiskey jug',
904
+ 902: 'whistle',
905
+ 903: 'wig',
906
+ 904: 'window screen',
907
+ 905: 'window shade',
908
+ 906: 'Windsor tie',
909
+ 907: 'wine bottle',
910
+ 908: 'wing',
911
+ 909: 'wok',
912
+ 910: 'wooden spoon',
913
+ 911: 'wool, woolen, woollen',
914
+ 912: 'worm fence, snake fence, snake-rail fence, Virginia fence',
915
+ 913: 'wreck',
916
+ 914: 'yawl',
917
+ 915: 'yurt',
918
+ 916: 'web site, website, internet site, site',
919
+ 917: 'comic book',
920
+ 918: 'crossword puzzle, crossword',
921
+ 919: 'street sign',
922
+ 920: 'traffic light, traffic signal, stoplight',
923
+ 921: 'book jacket, dust cover, dust jacket, dust wrapper',
924
+ 922: 'menu',
925
+ 923: 'plate',
926
+ 924: 'guacamole',
927
+ 925: 'consomme',
928
+ 926: 'hot pot, hotpot',
929
+ 927: 'trifle',
930
+ 928: 'ice cream, icecream',
931
+ 929: 'ice lolly, lolly, lollipop, popsicle',
932
+ 930: 'French loaf',
933
+ 931: 'bagel, beigel',
934
+ 932: 'pretzel',
935
+ 933: 'cheeseburger',
936
+ 934: 'hotdog, hot dog, red hot',
937
+ 935: 'mashed potato',
938
+ 936: 'head cabbage',
939
+ 937: 'broccoli',
940
+ 938: 'cauliflower',
941
+ 939: 'zucchini, courgette',
942
+ 940: 'spaghetti squash',
943
+ 941: 'acorn squash',
944
+ 942: 'butternut squash',
945
+ 943: 'cucumber, cuke',
946
+ 944: 'artichoke, globe artichoke',
947
+ 945: 'bell pepper',
948
+ 946: 'cardoon',
949
+ 947: 'mushroom',
950
+ 948: 'Granny Smith',
951
+ 949: 'strawberry',
952
+ 950: 'orange',
953
+ 951: 'lemon',
954
+ 952: 'fig',
955
+ 953: 'pineapple, ananas',
956
+ 954: 'banana',
957
+ 955: 'jackfruit, jak, jack',
958
+ 956: 'custard apple',
959
+ 957: 'pomegranate',
960
+ 958: 'hay',
961
+ 959: 'carbonara',
962
+ 960: 'chocolate sauce, chocolate syrup',
963
+ 961: 'dough',
964
+ 962: 'meat loaf, meatloaf',
965
+ 963: 'pizza, pizza pie',
966
+ 964: 'potpie',
967
+ 965: 'burrito',
968
+ 966: 'red wine',
969
+ 967: 'espresso',
970
+ 968: 'cup',
971
+ 969: 'eggnog',
972
+ 970: 'alp',
973
+ 971: 'bubble',
974
+ 972: 'cliff, drop, drop-off',
975
+ 973: 'coral reef',
976
+ 974: 'geyser',
977
+ 975: 'lakeside, lakeshore',
978
+ 976: 'promontory, headland, head, foreland',
979
+ 977: 'sandbar, sand bar',
980
+ 978: 'seashore, coast, seacoast, sea-coast',
981
+ 979: 'valley, vale',
982
+ 980: 'volcano',
983
+ 981: 'ballplayer, baseball player',
984
+ 982: 'groom, bridegroom',
985
+ 983: 'scuba diver',
986
+ 984: 'rapeseed',
987
+ 985: 'daisy',
988
+ 986: "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum",
989
+ 987: 'corn',
990
+ 988: 'acorn',
991
+ 989: 'hip, rose hip, rosehip',
992
+ 990: 'buckeye, horse chestnut, conker',
993
+ 991: 'coral fungus',
994
+ 992: 'agaric',
995
+ 993: 'gyromitra',
996
+ 994: 'stinkhorn, carrion fungus',
997
+ 995: 'earthstar',
998
+ 996: 'hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa',
999
+ 997: 'bolete',
1000
+ 998: 'ear, spike, capitulum',
1001
+ 999: 'toilet tissue, toilet paper, bathroom tissue'}
modeling/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ Copyright (2024) Bytedance Ltd. and/or its affiliates
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
modeling/blocks.py ADDED
@@ -0,0 +1,224 @@
1
+ """Building blocks for TiTok.
2
+
3
+ Copyright (2024) Bytedance Ltd. and/or its affiliates
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
17
+ Reference:
18
+ https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/transformer.py
19
+ """
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ from collections import OrderedDict
24
+
25
+
26
+ class ResidualAttentionBlock(nn.Module):
27
+ def __init__(
28
+ self,
29
+ d_model,
30
+ n_head,
31
+ mlp_ratio = 4.0,
32
+ act_layer = nn.GELU,
33
+ norm_layer = nn.LayerNorm
34
+ ):
35
+ super().__init__()
36
+
37
+ self.ln_1 = norm_layer(d_model)
38
+ self.attn = nn.MultiheadAttention(d_model, n_head)
39
+ self.mlp_ratio = mlp_ratio
40
+ # optionally we can disable the FFN
41
+ if mlp_ratio > 0:
42
+ self.ln_2 = norm_layer(d_model)
43
+ mlp_width = int(d_model * mlp_ratio)
44
+ self.mlp = nn.Sequential(OrderedDict([
45
+ ("c_fc", nn.Linear(d_model, mlp_width)),
46
+ ("gelu", act_layer()),
47
+ ("c_proj", nn.Linear(mlp_width, d_model))
48
+ ]))
49
+
50
+ def attention(
51
+ self,
52
+ x: torch.Tensor
53
+ ):
54
+ return self.attn(x, x, x, need_weights=False)[0]
55
+
56
+ def forward(
57
+ self,
58
+ x: torch.Tensor,
59
+ ):
60
+ attn_output = self.attention(x=self.ln_1(x))
61
+ x = x + attn_output
62
+ if self.mlp_ratio > 0:
63
+ x = x + self.mlp(self.ln_2(x))
64
+ return x
65
+
66
+
67
+ def _expand_token(token, batch_size: int):
68
+ return token.unsqueeze(0).expand(batch_size, -1, -1)
69
+
70
+
71
+ class TiTokEncoder(nn.Module):
72
+ def __init__(self, config):
73
+ super().__init__()
74
+ self.config = config
75
+ self.image_size = config.dataset.preprocessing.crop_size
76
+ self.patch_size = config.model.vq_model.vit_enc_patch_size
77
+ self.grid_size = self.image_size // self.patch_size
78
+ self.model_size = config.model.vq_model.vit_enc_model_size
79
+ self.num_latent_tokens = config.model.vq_model.num_latent_tokens
80
+ self.token_size = config.model.vq_model.token_size
81
+
82
+ self.width = {
83
+ "small": 512,
84
+ "base": 768,
85
+ "large": 1024,
86
+ }[self.model_size]
87
+ self.num_layers = {
88
+ "small": 8,
89
+ "base": 12,
90
+ "large": 24,
91
+ }[self.model_size]
92
+ self.num_heads = {
93
+ "small": 8,
94
+ "base": 12,
95
+ "large": 16,
96
+ }[self.model_size]
97
+
98
+ self.patch_embed = nn.Conv2d(
99
+ in_channels=3, out_channels=self.width,
100
+ kernel_size=self.patch_size, stride=self.patch_size, bias=True)
101
+
102
+ scale = self.width ** -0.5
103
+ self.class_embedding = nn.Parameter(scale * torch.randn(1, self.width))
104
+ self.positional_embedding = nn.Parameter(
105
+ scale * torch.randn(self.grid_size ** 2 + 1, self.width))
106
+ self.latent_token_positional_embedding = nn.Parameter(
107
+ scale * torch.randn(self.num_latent_tokens, self.width))
108
+ self.ln_pre = nn.LayerNorm(self.width)
109
+ self.transformer = nn.ModuleList()
110
+ for i in range(self.num_layers):
111
+ self.transformer.append(ResidualAttentionBlock(
112
+ self.width, self.num_heads, mlp_ratio=4.0
113
+ ))
114
+ self.ln_post = nn.LayerNorm(self.width)
115
+ self.conv_out = nn.Conv2d(self.width, self.token_size, kernel_size=1, bias=True)
116
+
117
+ def forward(self, pixel_values, latent_tokens):
118
+ batch_size = pixel_values.shape[0]
119
+ x = pixel_values
120
+ x = self.patch_embed(x)
121
+ x = x.reshape(x.shape[0], x.shape[1], -1)
122
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
123
+ # class embeddings and positional embeddings
124
+ x = torch.cat([_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1)
125
+ x = x + self.positional_embedding.to(x.dtype) # shape = [*, grid ** 2 + 1, width]
126
+
127
+
128
+ latent_tokens = _expand_token(latent_tokens, x.shape[0]).to(x.dtype)
129
+ latent_tokens = latent_tokens + self.latent_token_positional_embedding.to(x.dtype)
130
+ x = torch.cat([x, latent_tokens], dim=1)
131
+
132
+ x = self.ln_pre(x)
133
+ x = x.permute(1, 0, 2) # NLD -> LND
134
+ for i in range(self.num_layers):
135
+ x = self.transformer[i](x)
136
+ x = x.permute(1, 0, 2) # LND -> NLD
137
+
138
+ latent_tokens = x[:, 1+self.grid_size**2:]
139
+ latent_tokens = self.ln_post(latent_tokens)
140
+ # fake 2D shape
141
+ latent_tokens = latent_tokens.reshape(batch_size, self.width, self.num_latent_tokens, 1)
142
+ latent_tokens = self.conv_out(latent_tokens)
143
+ latent_tokens = latent_tokens.reshape(batch_size, self.token_size, 1, self.num_latent_tokens)
144
+ return latent_tokens
145
+
146
+
147
+ class TiTokDecoder(nn.Module):
148
+ def __init__(self, config):
149
+ super().__init__()
150
+ self.config = config
151
+ self.image_size = config.dataset.preprocessing.crop_size
152
+ self.patch_size = config.model.vq_model.vit_dec_patch_size
153
+ self.grid_size = self.image_size // self.patch_size
154
+ self.model_size = config.model.vq_model.vit_dec_model_size
155
+ self.num_latent_tokens = config.model.vq_model.num_latent_tokens
156
+ self.token_size = config.model.vq_model.token_size
157
+ self.width = {
158
+ "small": 512,
159
+ "base": 768,
160
+ "large": 1024,
161
+ }[self.model_size]
162
+ self.num_layers = {
163
+ "small": 8,
164
+ "base": 12,
165
+ "large": 24,
166
+ }[self.model_size]
167
+ self.num_heads = {
168
+ "small": 8,
169
+ "base": 12,
170
+ "large": 16,
171
+ }[self.model_size]
172
+
173
+ self.decoder_embed = nn.Linear(
174
+ self.token_size, self.width, bias=True)
175
+ scale = self.width ** -0.5
176
+ self.class_embedding = nn.Parameter(scale * torch.randn(1, self.width))
177
+ self.positional_embedding = nn.Parameter(
178
+ scale * torch.randn(self.grid_size ** 2 + 1, self.width))
179
+ # add mask token and query pos embed
180
+ self.mask_token = nn.Parameter(scale * torch.randn(1, 1, self.width))
181
+ self.latent_token_positional_embedding = nn.Parameter(
182
+ scale * torch.randn(self.num_latent_tokens, self.width))
183
+ self.ln_pre = nn.LayerNorm(self.width)
184
+ self.transformer = nn.ModuleList()
185
+ for i in range(self.num_layers):
186
+ self.transformer.append(ResidualAttentionBlock(
187
+ self.width, self.num_heads, mlp_ratio=4.0
188
+ ))
189
+ self.ln_post = nn.LayerNorm(self.width)
190
+
191
+ self.ffn = nn.Sequential(
192
+ nn.Conv2d(self.width, 2 * self.width, 1, padding=0, bias=True),
193
+ nn.Tanh(),
194
+ nn.Conv2d(2 * self.width, 1024, 1, padding=0, bias=True),
195
+ )
196
+ self.conv_out = nn.Identity()
197
+
198
+ def forward(self, z_quantized):
199
+ N, C, H, W = z_quantized.shape
200
+ assert H == 1 and W == self.num_latent_tokens, f"{H}, {W}, {self.num_latent_tokens}"
201
+ x = z_quantized.reshape(N, C*H, W).permute(0, 2, 1) # NLD
202
+ x = self.decoder_embed(x)
203
+
204
+ batchsize, seq_len, _ = x.shape
205
+
206
+ mask_tokens = self.mask_token.repeat(batchsize, self.grid_size**2, 1).to(x.dtype)
207
+ mask_tokens = torch.cat([_expand_token(self.class_embedding, mask_tokens.shape[0]).to(mask_tokens.dtype),
208
+ mask_tokens], dim=1)
209
+ mask_tokens = mask_tokens + self.positional_embedding.to(mask_tokens.dtype)
210
+ x = x + self.latent_token_positional_embedding[:seq_len]
211
+ x = torch.cat([mask_tokens, x], dim=1)
212
+
213
+ x = self.ln_pre(x)
214
+ x = x.permute(1, 0, 2) # NLD -> LND
215
+ for i in range(self.num_layers):
216
+ x = self.transformer[i](x)
217
+ x = x.permute(1, 0, 2) # LND -> NLD
218
+ x = x[:, 1:1+self.grid_size**2] # remove cls embed
219
+ x = self.ln_post(x)
220
+ # N L D -> N D H W
221
+ x = x.permute(0, 2, 1).reshape(batchsize, self.width, self.grid_size, self.grid_size)
222
+ x = self.ffn(x.contiguous())
223
+ x = self.conv_out(x)
224
+ return x
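
A minimal sketch of the residual attention block defined above, run on random data; the sizes below are illustrative assumptions, chosen only to show the sequence-first (L, N, D) layout that TiTokEncoder and TiTokDecoder feed into it:

    import torch
    from modeling.blocks import ResidualAttentionBlock

    block = ResidualAttentionBlock(d_model=768, n_head=12, mlp_ratio=4.0)
    x = torch.randn(197, 2, 768)   # (sequence, batch, width); nn.MultiheadAttention defaults to sequence-first
    y = block(x)
    print(y.shape)                 # torch.Size([197, 2, 768])
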
modeling/maskgit.py ADDED
@@ -0,0 +1,138 @@
1
+ """This file contains implementation for MaskGIT model.
2
+
3
+ Copyright (2024) Bytedance Ltd. and/or its affiliates
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
17
+ Reference:
18
+ https://github.com/huggingface/open-muse
19
+ https://github.com/baaivision/MUSE-Pytorch
20
+ """
21
+
22
+ import torch
23
+ from torch import nn
24
+ import numpy as np
25
+ import math
26
+ import torch.utils.checkpoint
27
+ from transformers import BertConfig, BertModel
28
+
29
+
30
+ class ImageBert(nn.Module):
31
+ def __init__(self, config):
32
+ super().__init__()
33
+ self.config = config
34
+ self.target_codebook_size = config.model.vq_model.codebook_size
35
+ self.condition_num_classes = config.model.generator.condition_num_classes
36
+ self.image_seq_len = config.model.generator.image_seq_len
37
+ self.mask_token_id = self.target_codebook_size
38
+
39
+ self.model = BertModel(BertConfig(
40
+ vocab_size=self.target_codebook_size + self.condition_num_classes + 2,
41
+ hidden_size=768,
42
+ num_hidden_layers=24,
43
+ num_attention_heads=16,
44
+ intermediate_size=3072,
45
+ hidden_act='gelu',
46
+ hidden_dropout_prob=config.model.generator.dropout,
47
+ attention_probs_dropout_prob=config.model.generator.attn_drop,
48
+ max_position_embeddings=config.model.generator.image_seq_len + 1,
49
+ initializer_range=0.02,
50
+ layer_norm_eps=1e-12,
51
+ pad_token_id=None,
52
+ position_embedding_type="absolute",
53
+ use_cache=True
54
+ ), add_pooling_layer=False)
55
+ self.model.lm_head = nn.Linear(768, self.target_codebook_size, bias=True)
56
+
57
+ self.model.post_init()
58
+
59
+ def forward(self, input_ids=None, condition=None, cond_drop_prob=0.1):
60
+ # Token space:
61
+ # [0, codebook_size - 1] : those are the learned quantized image tokens
62
+ # codebook_size : the mask token used to mask image tokens
63
+ # [codebook_size + 1, codebook_size + nclass] : the imagenet class tokens
64
+ # codebook_size + 1 + nclass : the class drop label
65
+ drop_label_mask = torch.rand_like(condition, dtype=torch.float) < cond_drop_prob
66
+ # Shift the classes
67
+ condition = condition + self.target_codebook_size + 1 # [0, 999] -> [codebook_size + 1, codebook_size + 1000]
68
+ condition[drop_label_mask] = self.condition_num_classes + self.target_codebook_size + 1
69
+ # prepend condition token
70
+ if input_ids is not None:
71
+ input_ids = torch.cat([condition.view(condition.shape[0], -1),
72
+ input_ids.view(input_ids.shape[0], -1),], dim=1)
73
+ else:
74
+ # at least there should be masked token
75
+ raise NotImplementedError
76
+ model_output = self.model(input_ids=input_ids)
77
+ model_output = model_output[0]
78
+ return self.model.lm_head(model_output[:, 1:]) # remove cond
79
+
80
+ # ref: https://github.com/baaivision/MUSE-Pytorch/blob/master/libs/muse.py#L40
81
+ @torch.no_grad()
82
+ def generate(self,
83
+ condition,
84
+ guidance_scale=3.0,
85
+ randomize_temperature=4.5,
86
+ num_sample_steps=8):
87
+ device = condition.device
88
+ ids = torch.full((condition.shape[0], self.image_seq_len),
89
+ self.mask_token_id, device=device)
90
+ cfg_scale = guidance_scale
91
+
92
+ for step in range(num_sample_steps):
93
+ ratio = 1. * (step + 1) / num_sample_steps
94
+ annealed_temp = randomize_temperature * (1.0 - ratio)
95
+ is_mask = (ids == self.mask_token_id)
96
+ if cfg_scale != 0:
97
+ cond_logits = self.forward(
98
+ ids, condition, cond_drop_prob=0.0
99
+ )
100
+ uncond_logits = self.forward(
101
+ ids, condition, cond_drop_prob=1.0
102
+ )
103
+ logits = cond_logits + (cond_logits - uncond_logits) * cfg_scale
104
+ else:
105
+ logits = self.forward(
106
+ ids, condition, cond_drop_prob=0.0
107
+ )
108
+ # Add gumbel noise
109
+ def log(t, eps=1e-20):
110
+ return torch.log(t.clamp(min=eps))
111
+ def gumbel_noise(t):
112
+ noise = torch.zeros_like(t).uniform_(0, 1)
113
+ return -log(-log(noise))
114
+ def add_gumbel_noise(t, temperature):
115
+ return t + temperature * gumbel_noise(t)
116
+
117
+ sampled_ids = add_gumbel_noise(logits, annealed_temp).argmax(dim=-1)
118
+ sampled_logits = torch.squeeze(
119
+ torch.gather(logits, dim=-1, index=torch.unsqueeze(sampled_ids, -1)), -1)
120
+ sampled_ids = torch.where(is_mask, sampled_ids, ids)
121
+ sampled_logits = torch.where(is_mask, sampled_logits, +np.inf).float()
122
+ # masking
123
+ mask_ratio = np.arccos(ratio) / (math.pi * 0.5)
124
+
125
+ mask_len = torch.Tensor([np.floor(self.image_seq_len * mask_ratio)]).to(device)
126
+ mask_len = torch.maximum(torch.Tensor([1]).to(device),
127
+ torch.minimum(torch.sum(is_mask, dim=-1, keepdims=True) - 1,
128
+ mask_len))[0].squeeze()
129
+ confidence = add_gumbel_noise(sampled_logits, annealed_temp)
130
+ sorted_confidence, _ = torch.sort(confidence, axis=-1)
131
+ cut_off = sorted_confidence[:, mask_len.long() - 1:mask_len.long()]
132
+ masking = (confidence <= cut_off)
133
+ if step == num_sample_steps - 1:
134
+ ids = sampled_ids
135
+ else:
136
+ ids = torch.where(masking, self.mask_token_id, sampled_ids)
137
+
138
+ return ids
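
A minimal sampling sketch for the generator above (not part of the uploaded files). The config keys and values here are assumptions inferred from the attribute accesses in ImageBert.__init__; the repository's released config files are the authoritative source, and with randomly initialized weights the sampled tokens are of course meaningless:

    import torch
    from omegaconf import OmegaConf
    from modeling.maskgit import ImageBert

    config = OmegaConf.create({"model": {
        "vq_model": {"codebook_size": 4096},
        "generator": {"condition_num_classes": 1000, "image_seq_len": 32,
                      "dropout": 0.1, "attn_drop": 0.1}}})
    generator = ImageBert(config).eval()
    class_labels = torch.tensor([207, 360])  # two ImageNet class ids
    tokens = generator.generate(class_labels, guidance_scale=3.0,
                                randomize_temperature=4.5, num_sample_steps=8)
    print(tokens.shape)  # torch.Size([2, 32]) -- one token sequence per class label
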
modeling/maskgit_vqgan.py ADDED
@@ -0,0 +1,362 @@
1
+ """This file contains code for MaskGIT-VQGAN.
2
+
3
+ This file may have been modified by Bytedance Ltd. and/or its affiliates (“Bytedance's Modifications”).
4
+ All Bytedance's Modifications are Copyright (year) Bytedance Ltd. and/or its affiliates.
5
+
6
+ Reference:
7
+ https://github.com/huggingface/open-muse/blob/main/muse/modeling_maskgit_vqgan.py
8
+ """
9
+ # Copyright 2023 Google LLC and The HuggingFace Inc. team.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+
23
+ r"""MaskGIT Tokenizer based on VQGAN.
24
+
25
+ This tokenizer is a reimplementation of VQGAN [https://arxiv.org/abs/2012.09841]
26
+ with several modifications. The non-local layers are removed from VQGAN for
27
+ faster speed.
28
+ """
29
+
30
+ import math
31
+
32
+ import torch
33
+ import torch.nn.functional as F
34
+ from torch import nn
35
+
36
+
37
+ # Conv2D with same padding
38
+ class Conv2dSame(nn.Conv2d):
39
+ def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
40
+ return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
41
+
42
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
43
+ ih, iw = x.size()[-2:]
44
+
45
+ pad_h = self.calc_same_pad(i=ih, k=self.kernel_size[0], s=self.stride[0], d=self.dilation[0])
46
+ pad_w = self.calc_same_pad(i=iw, k=self.kernel_size[1], s=self.stride[1], d=self.dilation[1])
47
+
48
+ if pad_h > 0 or pad_w > 0:
49
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
50
+ return super().forward(x)
51
+
52
+
53
+ class ResnetBlock(nn.Module):
54
+ def __init__(
55
+ self,
56
+ in_channels: int,
57
+ out_channels: int = None,
58
+ dropout_prob: float = 0.0,
59
+ ):
60
+ super().__init__()
61
+
62
+ self.in_channels = in_channels
63
+ self.out_channels = out_channels
64
+ self.out_channels_ = self.in_channels if self.out_channels is None else self.out_channels
65
+
66
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
67
+ self.conv1 = Conv2dSame(self.in_channels, self.out_channels_, kernel_size=3, bias=False)
68
+
69
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=self.out_channels_, eps=1e-6, affine=True)
70
+ self.dropout = nn.Dropout(dropout_prob)
71
+ self.conv2 = Conv2dSame(self.out_channels_, self.out_channels_, kernel_size=3, bias=False)
72
+
73
+ if self.in_channels != self.out_channels_:
74
+ self.nin_shortcut = Conv2dSame(self.out_channels_, self.out_channels_, kernel_size=1, bias=False)
75
+
76
+ def forward(self, hidden_states):
77
+ residual = hidden_states
78
+ hidden_states = self.norm1(hidden_states)
79
+ hidden_states = F.silu(hidden_states)
80
+ hidden_states = self.conv1(hidden_states)
81
+
82
+ hidden_states = self.norm2(hidden_states)
83
+ hidden_states = F.silu(hidden_states)
84
+ hidden_states = self.dropout(hidden_states)
85
+ hidden_states = self.conv2(hidden_states)
86
+
87
+ if self.in_channels != self.out_channels_:
88
+ residual = self.nin_shortcut(hidden_states)
89
+
90
+ return hidden_states + residual
91
+
92
+
93
+ class DownsamplingBlock(nn.Module):
94
+ def __init__(self, config, block_idx: int):
95
+ super().__init__()
96
+
97
+ self.config = config
98
+ self.block_idx = block_idx
99
+
100
+ in_channel_mult = (1,) + tuple(self.config.channel_mult)
101
+ block_in = self.config.hidden_channels * in_channel_mult[self.block_idx]
102
+ block_out = self.config.hidden_channels * self.config.channel_mult[self.block_idx]
103
+
104
+ res_blocks = nn.ModuleList()
105
+ for _ in range(self.config.num_res_blocks):
106
+ res_blocks.append(ResnetBlock(block_in, block_out, dropout_prob=self.config.dropout))
107
+ block_in = block_out
108
+ self.block = res_blocks
109
+
110
+ self.downsample = self.block_idx != self.config.num_resolutions - 1
111
+
112
+ def forward(self, hidden_states):
113
+ for res_block in self.block:
114
+ hidden_states = res_block(hidden_states)
115
+
116
+ if self.downsample:
117
+ hidden_states = F.avg_pool2d(hidden_states, kernel_size=2, stride=2)
118
+
119
+ return hidden_states
120
+
121
+
122
+ class UpsamplingBlock(nn.Module):
123
+ def __init__(self, config, block_idx: int):
124
+ super().__init__()
125
+
126
+ self.config = config
127
+ self.block_idx = block_idx
128
+
129
+ if self.block_idx == self.config.num_resolutions - 1:
130
+ block_in = self.config.hidden_channels * self.config.channel_mult[-1]
131
+ else:
132
+ block_in = self.config.hidden_channels * self.config.channel_mult[self.block_idx + 1]
133
+
134
+ block_out = self.config.hidden_channels * self.config.channel_mult[self.block_idx]
135
+
136
+ res_blocks = []
137
+ for _ in range(self.config.num_res_blocks):
138
+ res_blocks.append(ResnetBlock(block_in, block_out, dropout_prob=self.config.dropout))
139
+ block_in = block_out
140
+ self.block = nn.ModuleList(res_blocks)
141
+
142
+ self.add_upsample = self.block_idx != 0
143
+ if self.add_upsample:
144
+ self.upsample_conv = Conv2dSame(block_out, block_out, kernel_size=3)
145
+
146
+ def forward(self, hidden_states):
147
+ for res_block in self.block:
148
+ hidden_states = res_block(hidden_states)
149
+
150
+ if self.add_upsample:
151
+ hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
152
+ hidden_states = self.upsample_conv(hidden_states)
153
+
154
+ return hidden_states
155
+
156
+
157
+ class Encoder(nn.Module):
158
+ def __init__(self, config):
159
+ super().__init__()
160
+ self.config = config
161
+ # downsampling
162
+ self.conv_in = Conv2dSame(self.config.num_channels, self.config.hidden_channels, kernel_size=3, bias=False)
163
+
164
+ downsample_blocks = []
165
+ for i_level in range(self.config.num_resolutions):
166
+ downsample_blocks.append(DownsamplingBlock(self.config, block_idx=i_level))
167
+ self.down = nn.ModuleList(downsample_blocks)
168
+
169
+ # middle
170
+ mid_channels = self.config.hidden_channels * self.config.channel_mult[-1]
171
+ res_blocks = nn.ModuleList()
172
+ for _ in range(self.config.num_res_blocks):
173
+ res_blocks.append(ResnetBlock(mid_channels, mid_channels, dropout_prob=self.config.dropout))
174
+ self.mid = res_blocks
175
+
176
+ # end
177
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=mid_channels, eps=1e-6, affine=True)
178
+ self.conv_out = Conv2dSame(mid_channels, self.config.z_channels, kernel_size=1)
179
+
180
+ def forward(self, pixel_values):
181
+ # downsampling
182
+ hidden_states = self.conv_in(pixel_values)
183
+ for block in self.down:
184
+ hidden_states = block(hidden_states)
185
+
186
+ # middle
187
+ for block in self.mid:
188
+ hidden_states = block(hidden_states)
189
+
190
+ # end
191
+ hidden_states = self.norm_out(hidden_states)
192
+ hidden_states = F.silu(hidden_states)
193
+ hidden_states = self.conv_out(hidden_states)
194
+ return hidden_states
195
+
196
+
197
+ class Decoder(nn.Module):
198
+ def __init__(self, config):
199
+ super().__init__()
200
+
201
+ self.config = config
202
+
203
+ # compute in_channel_mult, block_in and curr_res at lowest res
204
+ block_in = self.config.hidden_channels * self.config.channel_mult[self.config.num_resolutions - 1]
205
+ curr_res = self.config.resolution // 2 ** (self.config.num_resolutions - 1)
206
+ self.z_shape = (1, self.config.z_channels, curr_res, curr_res)
207
+
208
+ # z to block_in
209
+ self.conv_in = Conv2dSame(self.config.z_channels, block_in, kernel_size=3)
210
+
211
+ # middle
212
+ res_blocks = nn.ModuleList()
213
+ for _ in range(self.config.num_res_blocks):
214
+ res_blocks.append(ResnetBlock(block_in, block_in, dropout_prob=self.config.dropout))
215
+ self.mid = res_blocks
216
+
217
+ # upsampling
218
+ upsample_blocks = []
219
+ for i_level in reversed(range(self.config.num_resolutions)):
220
+ upsample_blocks.append(UpsamplingBlock(self.config, block_idx=i_level))
221
+ self.up = nn.ModuleList(list(reversed(upsample_blocks))) # reverse to get consistent order
222
+
223
+ # end
224
+ block_out = self.config.hidden_channels * self.config.channel_mult[0]
225
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_out, eps=1e-6, affine=True)
226
+ self.conv_out = Conv2dSame(block_out, self.config.num_channels, kernel_size=3)
227
+
228
+ def forward(self, hidden_states):
229
+ # z to block_in
230
+ hidden_states = self.conv_in(hidden_states)
231
+
232
+ # middle
233
+ for block in self.mid:
234
+ hidden_states = block(hidden_states)
235
+
236
+ # upsampling
237
+ for block in reversed(self.up):
238
+ hidden_states = block(hidden_states)
239
+
240
+ # end
241
+ hidden_states = self.norm_out(hidden_states)
242
+ hidden_states = F.silu(hidden_states)
243
+ hidden_states = self.conv_out(hidden_states)
244
+
245
+ return hidden_states
246
+
247
+
248
+ class VectorQuantizer(nn.Module):
249
+ """
250
+ see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
251
+ Discretization bottleneck part of the VQ-VAE.
252
+ """
253
+
254
+ def __init__(self, num_embeddings, embedding_dim, commitment_cost):
255
+ r"""
256
+ Args:
257
+ num_embeddings: number of vectors in the quantized space.
258
+ embedding_dim: dimensionality of the tensors in the quantized space.
259
+ Inputs to the modules must be in this format as well.
260
+ commitment_cost: scalar which controls the weighting of the loss terms
261
+ (see equation 4 in the paper https://arxiv.org/abs/1711.00937 - this variable is Beta).
262
+ """
263
+ super().__init__()
264
+
265
+ self.num_embeddings = num_embeddings
266
+ self.embedding_dim = embedding_dim
267
+ self.commitment_cost = commitment_cost
268
+
269
+ self.embedding = nn.Embedding(num_embeddings, embedding_dim)
270
+ self.embedding.weight.data.uniform_(-1.0 / num_embeddings, 1.0 / num_embeddings)
271
+
272
+ def forward(self, hidden_states, return_loss=False):
273
+ """
274
+ Inputs the output of the encoder network z and maps it to a discrete one-hot vector that is the index of the
275
+ closest embedding vector e_j z (continuous) -> z_q (discrete) z.shape = (batch, channel, height, width)
276
+ quantization pipeline:
277
+ 1. get encoder input (B,C,H,W)
278
+ 2. flatten input to (B*H*W,C)
279
+ """
280
+ # reshape z -> (batch, height, width, channel) and flatten
281
+ hidden_states = hidden_states.permute(0, 2, 3, 1).contiguous()
282
+
283
+ distances = self.compute_distances(hidden_states)
284
+ min_encoding_indices = torch.argmin(distances, axis=1).unsqueeze(1)
285
+ min_encodings = torch.zeros(min_encoding_indices.shape[0], self.num_embeddings).to(hidden_states)
286
+ min_encodings.scatter_(1, min_encoding_indices, 1)
287
+
288
+ # get quantized latent vectors
289
+ z_q = torch.matmul(min_encodings, self.embedding.weight).view(hidden_states.shape)
290
+
291
+ # reshape to (batch, num_tokens)
292
+ min_encoding_indices = min_encoding_indices.reshape(hidden_states.shape[0], -1)
293
+
294
+ # compute loss for embedding
295
+ loss = None
296
+ if return_loss:
297
+ loss = torch.mean((z_q.detach() - hidden_states) ** 2) + self.commitment_cost * torch.mean(
298
+ (z_q - hidden_states.detach()) ** 2
299
+ )
300
+ # preserve gradients
301
+ z_q = hidden_states + (z_q - hidden_states).detach()
302
+
303
+ # reshape back to match original input shape
304
+ z_q = z_q.permute(0, 3, 1, 2).contiguous()
305
+
306
+ return z_q, min_encoding_indices, loss
307
+
308
+ def compute_distances(self, hidden_states):
309
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
310
+ hidden_states_flattended = hidden_states.reshape((-1, self.embedding_dim))
311
+ emb_weights = self.embedding.weight.t()
312
+
313
+ inputs_norm_sq = hidden_states_flattended.pow(2.0).sum(dim=1, keepdim=True)
314
+ codebook_t_norm_sq = emb_weights.pow(2.0).sum(dim=0, keepdim=True)
315
+ distances = torch.addmm(
316
+ inputs_norm_sq + codebook_t_norm_sq,
317
+ hidden_states_flattended,
318
+ emb_weights,
319
+ alpha=-2.0,
320
+ )
321
+ return distances
322
+
323
+ def get_codebook_entry(self, indices):
324
+ # indices are expected to be of shape (batch, num_tokens)
325
+ # get quantized latent vectors
326
+ if len(indices.shape) == 2:
327
+ batch, num_tokens = indices.shape
328
+ z_q = self.embedding(indices)
329
+ z_q = z_q.reshape(batch, int(math.sqrt(num_tokens)), int(math.sqrt(num_tokens)), -1).permute(0, 3, 1, 2)
330
+ elif len(indices.shape) == 3:
331
+ batch, height, width = indices.shape
332
+ indices = indices.view(batch, -1)
333
+ z_q = self.embedding(indices)
334
+ z_q = z_q.reshape(batch, height, width, -1).permute(0, 3, 1, 2)
335
+ else:
336
+ print(indices.shape)
337
+ raise NotImplementedError
338
+ return z_q
339
+
340
+ # adapted from https://github.com/kakaobrain/rq-vae-transformer/blob/main/rqvae/models/rqvae/quantizations.py#L372
341
+ def get_soft_code(self, hidden_states, temp=1.0, stochastic=False):
342
+ hidden_states = hidden_states.permute(0, 2, 3, 1).contiguous() # (batch, height, width, channel)
343
+ distances = self.compute_distances(hidden_states) # (batch * height * width, num_embeddings)
344
+
345
+ soft_code = F.softmax(-distances / temp, dim=-1) # (batch * height * width, num_embeddings)
346
+ if stochastic:
347
+ code = torch.multinomial(soft_code, 1) # (batch * height * width, 1)
348
+ else:
349
+ code = distances.argmin(dim=-1) # (batch * height * width)
350
+
351
+ code = code.reshape(hidden_states.shape[0], -1) # (batch, height * width)
352
+ batch, num_tokens = code.shape
353
+ soft_code = soft_code.reshape(batch, num_tokens, -1) # (batch, height * width, num_embeddings)
354
+ return soft_code, code
355
+
356
+ def get_code(self, hidden_states):
357
+ # reshape z -> (batch, height, width, channel)
358
+ hidden_states = hidden_states.permute(0, 2, 3, 1).contiguous()
359
+ distances = self.compute_distances(hidden_states)
360
+ indices = torch.argmin(distances, axis=1).unsqueeze(1)
361
+ indices = indices.reshape(hidden_states.shape[0], -1)
362
+ return indices
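
A minimal sketch of the pixel decoder defined above, using the same OmegaConf settings that modeling/titok.py passes in; the input latent here is random and only meant to show the expected shapes:

    import torch
    from omegaconf import OmegaConf
    from modeling.maskgit_vqgan import Decoder

    config = OmegaConf.create({
        "channel_mult": [1, 1, 2, 2, 4], "num_resolutions": 5, "dropout": 0.0,
        "hidden_channels": 128, "num_channels": 3, "num_res_blocks": 2,
        "resolution": 256, "z_channels": 256})
    decoder = Decoder(config).eval()
    z = torch.randn(1, 256, 16, 16)   # (batch, z_channels, 16, 16) latent grid
    with torch.no_grad():
        image = decoder(z)
    print(image.shape)                # torch.Size([1, 3, 256, 256])
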
modeling/quantizer.py ADDED
@@ -0,0 +1,92 @@
1
+ """Vector quantizer.
2
+
3
+ Copyright (2024) Bytedance Ltd. and/or its affiliates
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
17
+ Reference:
18
+ https://github.com/CompVis/taming-transformers/blob/master/taming/modules/vqvae/quantize.py
19
+ https://github.com/google-research/magvit/blob/main/videogvt/models/vqvae.py
20
+ """
21
+ from typing import Mapping, Text, Tuple
22
+
23
+ import torch
24
+ from einops import rearrange
25
+ from torch.cuda.amp import autocast
26
+
27
+ class VectorQuantizer(torch.nn.Module):
28
+ def __init__(self,
29
+ codebook_size: int = 1024,
30
+ token_size: int = 256,
31
+ commitment_cost: float = 0.25,
32
+ use_l2_norm: bool = False,
33
+ ):
34
+ super().__init__()
35
+ self.commitment_cost = commitment_cost
36
+
37
+ self.embedding = torch.nn.Embedding(codebook_size, token_size)
38
+ self.embedding.weight.data.uniform_(-1.0 / codebook_size, 1.0 / codebook_size)
39
+ self.use_l2_norm = use_l2_norm
40
+
41
+ # Ensure quantization is performed using f32
42
+ @autocast(enabled=False)
43
+ def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, Mapping[Text, torch.Tensor]]:
44
+ z = z.float()
45
+ z = rearrange(z, 'b c h w -> b h w c').contiguous()
46
+ z_flattened = rearrange(z, 'b h w c -> (b h w) c')
47
+
48
+ if self.use_l2_norm:
49
+ z_flattened = torch.nn.functional.normalize(z_flattened, dim=-1)
50
+ embedding = torch.nn.functional.normalize(self.embedding.weight, dim=-1)
51
+ else:
52
+ embedding = self.embedding.weight
53
+ d = torch.sum(z_flattened**2, dim=1, keepdim=True) + \
54
+ torch.sum(embedding**2, dim=1) - 2 * \
55
+ torch.einsum('bd,dn->bn', z_flattened, embedding.T)
56
+
57
+ min_encoding_indices = torch.argmin(d, dim=1) # num_ele
58
+ z_quantized = self.get_codebook_entry(min_encoding_indices).view(z.shape)
59
+
60
+ if self.use_l2_norm:
61
+ z_quantized = torch.nn.functional.normalize(z_quantized, dim=-1)
62
+ z = torch.nn.functional.normalize(z, dim=-1)
63
+
64
+ # compute loss for embedding
65
+ commitment_loss = self.commitment_cost * torch.mean((z_quantized.detach() - z) **2)
66
+ codebook_loss = torch.mean((z_quantized - z.detach()) **2)
67
+
68
+ loss = commitment_loss + codebook_loss
69
+
70
+ # preserve gradients
71
+ z_quantized = z + (z_quantized - z).detach()
72
+
73
+ # reshape back to match original input shape
74
+ z_quantized = rearrange(z_quantized, 'b h w c -> b c h w').contiguous()
75
+
76
+ result_dict = dict(
77
+ quantizer_loss=loss,
78
+ commitment_loss=commitment_loss,
79
+ codebook_loss=codebook_loss,
80
+ min_encoding_indices=min_encoding_indices.view(z_quantized.shape[0], z_quantized.shape[2], z_quantized.shape[3])
81
+ )
82
+
83
+ return z_quantized, result_dict
84
+
85
+ def get_codebook_entry(self, indices):
86
+ if len(indices.shape) == 1:
87
+ z_quantized = self.embedding(indices)
88
+ elif len(indices.shape) == 2:
89
+ z_quantized = torch.einsum('bd,dn->bn', indices, self.embedding.weight)
90
+ else:
91
+ raise NotImplementedError
92
+ return z_quantized
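
A minimal sketch of this quantizer on random data (not part of the uploaded files); the codebook_size and token_size values are placeholders, since the real values come from the model config:

    import torch
    from modeling.quantizer import VectorQuantizer

    quantizer = VectorQuantizer(codebook_size=4096, token_size=12, use_l2_norm=True)
    z = torch.randn(2, 12, 1, 32)   # (batch, token_size, 1, num_latent_tokens), as produced by TiTokEncoder
    z_quantized, result_dict = quantizer(z)
    print(z_quantized.shape)                            # torch.Size([2, 12, 1, 32])
    print(result_dict["min_encoding_indices"].shape)    # torch.Size([2, 1, 32])
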
modeling/titok.py ADDED
@@ -0,0 +1,97 @@
1
+ """This file contains the model definition of TiTok.
2
+
3
+ Copyright (2024) Bytedance Ltd. and/or its affiliates
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ """
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from einops import rearrange
21
+
22
+ from .blocks import TiTokEncoder, TiTokDecoder
23
+ from .quantizer import VectorQuantizer
24
+ from .maskgit_vqgan import Decoder as Pixel_Decoder
25
+ from .maskgit_vqgan import VectorQuantizer as Pixel_Quantizer
26
+ from omegaconf import OmegaConf
27
+
28
+ class TiTok(nn.Module):
29
+ def __init__(self, config):
30
+ super().__init__()
31
+ self.config = config
32
+ self.encoder = TiTokEncoder(config)
33
+ self.decoder = TiTokDecoder(config)
34
+
35
+ self.num_latent_tokens = config.model.vq_model.num_latent_tokens
36
+ scale = self.encoder.width ** -0.5
37
+ self.latent_tokens = nn.Parameter(
38
+ scale * torch.randn(self.num_latent_tokens, self.encoder.width))
39
+
40
+ self.apply(self._init_weights)
41
+
42
+ self.quantize = VectorQuantizer(
43
+ codebook_size=config.model.vq_model.codebook_size,
44
+ token_size=config.model.vq_model.token_size,
45
+ commitment_cost=config.model.vq_model.commitment_cost,
46
+ use_l2_norm=config.model.vq_model.use_l2_norm,)
47
+
48
+ self.pixel_quantize = Pixel_Quantizer(
49
+ num_embeddings=1024, embedding_dim=256, commitment_cost=0.25)
50
+ self.pixel_decoder = Pixel_Decoder(OmegaConf.create(
51
+ {"channel_mult": [1, 1, 2, 2, 4],
52
+ "num_resolutions": 5,
53
+ "dropout": 0.0,
54
+ "hidden_channels": 128,
55
+ "num_channels": 3,
56
+ "num_res_blocks": 2,
57
+ "resolution": 256,
58
+ "z_channels": 256}))
59
+
60
+ def _init_weights(self, module):
61
+ """ Initialize the weights.
62
+ :param:
63
+ module -> torch.nn.Module: module to initialize
64
+ """
65
+ if isinstance(module, nn.Linear) or isinstance(module, nn.Conv1d) or isinstance(module, nn.Conv2d):
66
+ module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=0.02)
67
+ if module.bias is not None:
68
+ module.bias.data.zero_()
69
+ elif isinstance(module, nn.Embedding):
70
+ module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=0.02)
71
+ elif isinstance(module, nn.LayerNorm):
72
+ module.bias.data.zero_()
73
+ module.weight.data.fill_(1.0)
74
+
75
+ def encode(self, x):
76
+ z = self.encoder(pixel_values=x, latent_tokens=self.latent_tokens)
77
+ z_quantized, result_dict = self.quantize(z)
78
+ return z_quantized, result_dict
79
+
80
+ def decode(self, z_quantized):
81
+ decoded_latent = self.decoder(z_quantized)
82
+ quantized_states = torch.einsum(
83
+ 'nchw,cd->ndhw', decoded_latent.softmax(1),
84
+ self.pixel_quantize.embedding.weight)
85
+ decoded = self.pixel_decoder(quantized_states)
86
+ return decoded
87
+
88
+ def decode_tokens(self, tokens):
89
+ tokens = tokens.squeeze(1)
90
+ batch, seq_len = tokens.shape # B x N
91
+ z_quantized = self.quantize.get_codebook_entry(
92
+ tokens.reshape(-1)).reshape(batch, 1, seq_len, -1)
93
+ if self.quantize.use_l2_norm:
94
+ z_quantized = torch.nn.functional.normalize(z_quantized, dim=-1)
95
+ z_quantized = rearrange(z_quantized, 'b h w c -> b c h w').contiguous()
96
+ decoded = self.decode(z_quantized)
97
+ return decoded
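
An end-to-end usage sketch for TiTok (not part of the uploaded files). The config values below are assumptions chosen to be self-consistent with the code in this file; the released YAML configs are the authoritative source. With randomly initialized weights this only exercises the tensor shapes; pretrained checkpoints are needed for meaningful reconstructions:

    import torch
    from omegaconf import OmegaConf
    from modeling.titok import TiTok

    config = OmegaConf.create({
        "dataset": {"preprocessing": {"crop_size": 256}},
        "model": {"vq_model": {
            "vit_enc_patch_size": 16, "vit_dec_patch_size": 16,
            "vit_enc_model_size": "base", "vit_dec_model_size": "base",
            "num_latent_tokens": 32, "token_size": 12,
            "codebook_size": 4096, "commitment_cost": 0.25, "use_l2_norm": True}}})

    titok = TiTok(config).eval()
    image = torch.rand(1, 3, 256, 256)                  # pixel values in [0, 1]
    with torch.no_grad():
        z_quantized, result_dict = titok.encode(image)
        tokens = result_dict["min_encoding_indices"]    # (1, 1, 32) discrete token ids
        reconstruction = titok.decode_tokens(tokens)    # (1, 3, 256, 256)
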
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ torch>=2.0.0
2
+ torchvision
3
+ omegaconf
4
+ transformers
5
+ timm
6
+ open_clip_torch
7
+ einops
8
+ scipy
9
+ pillow
10
+ accelerate
11
+ gdown
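
The dependencies listed above can be installed in one step from the repository root, for example:

    pip install -r requirements.txt
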