Code health maintenance + generate_valid_caption
joy CHANGED
@@ -19,6 +19,7 @@ import argparse
 import re
 import random
 from pathlib import Path
+from typing import List, Tuple, Dict
 from PIL import Image
 import pillow_jxl
 import torch
@@ -32,7 +33,6 @@ from transformers import (
 )
 from torch import nn
 from e6db_reader import TagSetNormalizer, tag_category2id, tag_rank_to_freq
-from typing import List, Tuple, Dict
 
 CLIP_PATH = "google/siglip-so400m-patch14-384"
 MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
@@ -62,7 +62,8 @@ CAPTION_TYPE_MAP = {
         "Write a stable diffusion prompt for this image."
     ],
    ("training_prompt", "formal", False, True): [
-        "Write a stable diffusion prompt for this image within {word_count} words."
+        "Write a stable diffusion prompt for this image within " +
+        "{word_count} words."
     ],
     ("training_prompt", "formal", True, False): [
         "Write a {length} stable diffusion prompt for this image."
@@ -90,12 +91,18 @@ class ImageAdapter(nn.Module):
     embeddings, and deep feature extraction.
 
     Args:
-        input_features (int): Number of input features from the vision model.
-        output_features (int): Number of output features to match the text model.
-        ln1 (bool): Whether to use layer normalization.
-        pos_emb (bool): Whether to use positional embeddings.
-        num_image_tokens (int): Number of image tokens.
-        deep_extract (bool): Whether to use deep feature extraction.
+        input_features (int):
+            Number of input features from the vision model.
+        output_features (int):
+            Number of output features to match the text model.
+        ln1 (bool):
+            Whether to use layer normalization.
+        pos_emb (bool):
+            Whether to use positional embeddings.
+        num_image_tokens (int):
+            Number of image tokens.
+        deep_extract (bool):
+            Whether to use deep feature extraction.
     """
 
     def __init__(
@@ -131,7 +138,8 @@ class ImageAdapter(nn.Module):
         Forward pass of the image adapter.
 
         Args:
-            vision_outputs (torch.Tensor): Output tensor from the CLIP vision model.
+            vision_outputs (torch.Tensor):
+                Output tensor from the CLIP vision model.
 
         Returns:
             torch.Tensor: Adapted image features.
@@ -148,9 +156,10 @@ class ImageAdapter(nn.Module):
                 dim=-1,
             )
             assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}"
+            expected_shape = vision_outputs[-2].shape[-1] * 5
             assert (
-                x.shape[-1] == vision_outputs[-2].shape[-1] * 5
-            ), f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}"
+                x.shape[-1] == expected_shape
+            ), f"Expected {expected_shape}, got {x.shape[-1]}"
         else:
             x = vision_outputs[-2]
 
@@ -167,9 +176,8 @@ class ImageAdapter(nn.Module):
         x = self.linear2(x)
 
         other_tokens = self.other_tokens(
-            torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(
-                x.shape[0], -1
-            )
+            torch.tensor([0, 1], device=self.other_tokens.weight.device)
+            .expand(x.shape[0], -1)
         )
         assert other_tokens.shape == (
             x.shape[0],
@@ -194,10 +202,13 @@ class ImageAdapter(nn.Module):
 
 class JoyCaptionModel:
     """
-    A class for generating captions for images using CLIP, LLM, and custom image adapters.
+    A class for generating captions for images using CLIP, LLM,
+    and custom image adapters.
+
+    This class encapsulates the functionality to load and initialize
+    various models (CLIP, LLM, image adapter) and use them to process
+    images and generate captions.
 
-    This class encapsulates the functionality to load and initialize various models
-    (CLIP, LLM, image adapter) and use them to process images and generate captions.
     It supports different caption types, tones, and lengths.
 
     Attributes:
@@ -209,7 +220,8 @@ class JoyCaptionModel:
     Methods:
         load_models(): Load and initialize all required models.
         process_image(input_image, caption_type, caption_tone, caption_length):
-            Process an input image and generate a caption based on specified parameters.
+            Process an input image and generate a caption
+            based on specified parameters.
     """
 
     def __init__(self):
@@ -232,7 +244,8 @@ class JoyCaptionModel:
             CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
         )
         checkpoint = {
-            k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()
+            k.replace("_orig_mod.module.", ""): v
+            for k, v in checkpoint.items()
         }
         self.clip_model.load_state_dict(checkpoint)
         del checkpoint
@@ -242,16 +255,20 @@ class JoyCaptionModel:
         self.clip_model.to("cuda")
 
         print("Loading tokenizer")
-        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_PATH, use_fast=False
+        )
+        assert isinstance(
+            self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
+        )
 
         print("Loading LLM")
         if (CHECKPOINT_PATH / "text_model").exists():
             print("Loading VLM's custom text model")
             self.text_model = AutoModelForCausalLM.from_pretrained(
-                CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16
+                CHECKPOINT_PATH / "text_model",
+                device_map=0,
+                torch_dtype=torch.bfloat16
             )
         else:
             self.text_model = AutoModelForCausalLM.from_pretrained(
@@ -270,7 +287,10 @@ class JoyCaptionModel:
             False,
         )
         self.image_adapter.load_state_dict(
-            torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu")
+            torch.load(
+                CHECKPOINT_PATH / "image_adapter.pt",
+                map_location="cpu"
+            )
         )
         self.image_adapter.eval()
         self.image_adapter.to("cuda")
@@ -285,7 +305,8 @@ class JoyCaptionModel:
         custom_prompt: str | None = None,
     ) -> str:
         """
-        Process an input image and generate a caption based on specified parameters.
+        Process an input image and generate a caption based on specified
+        parameters.
         """
         torch.cuda.empty_cache()
 
@@ -305,11 +326,39 @@ class JoyCaptionModel:
             embedded_images, prompt
         )
 
-        generate_ids = self._generate_caption(inputs_embeds, input_ids, attention_mask)
+        generate_ids = self._generate_caption(inputs_embeds,
+                                              input_ids,
+                                              attention_mask)
        caption = self._decode_caption(generate_ids, input_ids)
 
         return caption.strip()
 
+    def generate_valid_caption(
+        self,
+        input_image: Image.Image,
+        caption_type: str,
+        caption_tone: str,
+        caption_length: str | int,
+        custom_prompt: str | None = None,
+    ) -> str:
+        """
+        Generate a valid caption, retrying if the caption contains only special
+        characters or does not end with a period, exclamation mark, or
+        question mark.
+        """
+        while True:
+            caption = self.process_image(
+                input_image, caption_type, caption_tone,
+                caption_length, custom_prompt
+            )
+            # This regex checks if the caption contains at least one word character
+            # and ends with a period, exclamation mark, or question mark.
+            # \w matches any word character (letters, digits, or underscore)
+            # caption[-1] checks the last character of the caption
+            if re.search(r'\w', caption) and caption[-1] in {'.', '!', '?'}:
+                return caption
+            print("Generated caption is invalid. Retrying...")
+
     def _get_prompt_string(self, caption_type, caption_tone, caption_length):
         length = None if caption_length == "any" else caption_length
 
@@ -400,10 +449,16 @@ class JoyCaptionModel:
 
         input_ids = torch.cat(
             [
-                torch.tensor([[self.tokenizer.bos_token_id]], dtype=torch.long),
-                torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
+                torch.tensor(
+                    [[self.tokenizer.bos_token_id]], dtype=torch.long
+                ),
+                torch.zeros(
+                    (1, embedded_images.shape[1]), dtype=torch.long
+                ),
                 prompt,
-                torch.tensor([[self.tokenizer.eos_token_id]], dtype=torch.long),
+                torch.tensor(
+                    [[self.tokenizer.eos_token_id]], dtype=torch.long
+                ),
             ],
             dim=1,
         ).to("cuda")
@@ -423,23 +478,31 @@ class JoyCaptionModel:
         return generate_ids
 
     def _decode_caption(self, generate_ids, input_ids):
-        generate_ids = generate_ids[:, input_ids.shape[1] :]
+        generate_ids = generate_ids[:, input_ids.shape[1]:]
 
-        if generate_ids[0][-1] == self.tokenizer.eos_token_id or generate_ids[0][
-            -1
-        ] == self.tokenizer.convert_tokens_to_ids("<|eot_id|>"):
+        if (generate_ids[0][-1] == self.tokenizer.eos_token_id or
+                generate_ids[0][-1] == self.tokenizer.convert_tokens_to_ids(
+                    "<|eot_id|>")):
             generate_ids = generate_ids[:, :-1]
 
         caption = self.tokenizer.batch_decode(
-            generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
+            generate_ids,
+            skip_special_tokens=False,
+            clean_up_tokenization_spaces=False
        )[0]
         return caption
 
 
 def main():
-    """Generate captions for images in a directory and save them as .caption files."""
+    """
+    Generate captions for images in a directory
+    and save them as .caption files.
+    """
     parser = argparse.ArgumentParser(
-        description="Generate captions for images in a directory and save them as .caption files."
+        description=(
+            "Generate captions for images in a directory and save them as "
+            ".caption files."
+        )
     )
     parser.add_argument(
         "directory", type=str, help="Target directory containing images."
@@ -459,17 +522,25 @@ def main():
         help="Tone of the caption.",
     )
     parser.add_argument(
-        "--caption_length", type=str, default="any", help="Length of the caption."
+        "--caption_length",
+        type=str,
+        default="any",
+        help="Length of the caption."
     )
     parser.add_argument(
         "--dont-strip-commas",
         action="store_true",
-        help="If set, commas will not be stripped from the generated captions.",
+        help=(
+            "If set, commas will not be stripped from the generated captions."
+        ),
     )
     parser.add_argument(
         "--custom_prompt",
         type=str,
-        help="Custom prompt for the captioner. Use with --caption_type custom.",
+        help=(
+            "Custom prompt for the captioner. "
+            "Use with --caption_type custom."
+        ),
     )
     parser.add_argument(
         "--add-commas-to-sentence-ends",
@@ -481,19 +552,28 @@ def main():
         type=int,
         nargs="?",
         const=-1,
-        help="Use .txt files with the same base filename as the images as input to the captioner. Optionally specify the number of tags to use.",
+        help=(
+            "Use .txt files with the same base filename "
+            "as the images as input to the captioner. "
+            "Optionally specify the number of tags to use."
+        ),
     )
     parser.add_argument(
         "--random-tags",
         type=int,
-        help="Randomly select n number of tags. Only works if --feed-from-tags is enabled.",
+        help=(
+            "Randomly select n number of tags. "
+            "Only works if --feed-from-tags is enabled."
+        ),
     )
 
     args = parser.parse_args()
 
     # Validate random-tags usage
     if args.random_tags is not None and args.feed_from_tags is None:
-        parser.error("--random-tags can only be used when --feed-from-tags is enabled")
+        parser.error(
+            "--random-tags can only be used when --feed-from-tags is enabled"
+        )
 
     print("Loading e621 tag data")
     tagset_normalizer = make_tagset_normalizer()
@@ -504,9 +584,13 @@ def main():
 
     # Validate custom prompt usage
     if args.caption_type == "custom" and not args.custom_prompt:
-        parser.error("--custom_prompt is required when using --caption_type custom")
+        parser.error(
+            "--custom_prompt is required when using --caption_type custom"
+        )
     elif args.caption_type != "custom" and args.custom_prompt:
-        parser.error("--custom_prompt can only be used with --caption_type custom")
+        parser.error(
+            "--custom_prompt can only be used with --caption_type custom"
+        )
 
     image_extensions = {".webp", ".png", ".jpeg", ".jpg", ".jxl"}
     for image_path in Path(args.directory).rglob("*"):
@@ -525,11 +609,13 @@ def main():
         if args.caption_type == "custom":
             custom_prompt = args.custom_prompt
         elif args.feed_from_tags is not None:
-            custom_prompt = prompt_from_tags(args, image_path, tagset_normalizer)
+            custom_prompt = prompt_from_tags(
+                args, image_path, tagset_normalizer
+            )
 
         print(f"Custom prompt: {custom_prompt}")
 
-        caption = joy_caption_model.process_image(
+        caption = joy_caption_model.generate_valid_caption(
            input_image,
             args.caption_type,
             args.caption_tone,
@@ -611,16 +697,19 @@ def make_tagset_normalizer():
     return tagset_normalizer.map_inputs(input_map, on_conflict="ignore")
 
 
-def format_nl_list(
+def format_nl_list(word_list):
+    """
+    Takes a list of words and generates a natural language output.
+    """
+    n = len(word_list)
     assert n > 0
     if n == 1:
-        return
+        return word_list[0]
+    if n == 2:
+        return f"{word_list[0]} and {word_list[1]}"
+    # n > 2
+    *head, last = word_list
+    return ", ".join(head) + ", and " + last
 
 
 TAG_SPECIES = tag_category2id["species"]
@@ -631,14 +720,17 @@ TAG_META = tag_category2id["meta"]
 TAG_FREQ_THRESH = 0
 
 
-def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer):
+def prompt_from_tags(args, image_path: Path,
+                     tagset_normalizer: TagSetNormalizer):
     """
     Generates a prompt from tags associated with the given image.
 
     Args:
         args: Additional arguments for the function.
-        image_path (Path): The path to the image file.
-        tagset_normalizer (TagSetNormalizer): An instance to normalize the tag set.
+        image_path (Path):
+            The path to the image file.
+        tagset_normalizer (TagSetNormalizer):
+            An instance to normalize the tag set.
 
     Returns:
         None
@@ -655,7 +747,8 @@ def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer
 
     # These lists contain tuples (freq, tag, tag_id)
     tag_by_category: Dict[int, List[Tuple[int, str, int]]] = {
-        cat: [] for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
+        cat: []
+        for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
     }
     other_tags: List[Tuple[int, str, int]] = []
     implied: set = set()
@@ -664,8 +757,8 @@ def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer
         # Encode the tag into a numerical id
         tag_id = encode(tag.replace(" ", "_"))
         if tag_id is None:
-            other_tags.append((0, tag,
-            implied.update(tagset_normalizer.implications_rej.get(
+            other_tags.append((0, tag, 0))
+            implied.update(tagset_normalizer.implications_rej.get(0, ()))
             continue
         # Get the category of the tag
         cat_id = tag_id_to_cat_id[tag_id]
@@ -677,13 +770,16 @@ def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer
         freq = tag_rank_to_freq(tag_id)
         if freq < TAG_FREQ_THRESH:
             continue
-        tag_by_category.get(cat_id, other_tags).append((int(freq), tag, tag_id))
+        tag_by_category.get(cat_id, other_tags).append(
+            (int(freq), tag, tag_id)
+        )
 
     other_tags = sorted(
         (int(freq), tag, tag_id)
         for freq, tag, tag_id in other_tags
         if tag_id not in implied
     )
+
     for cat_id, cat_list in tag_by_category.items():
         tag_by_category[cat_id] = sorted(
             (int(freq), tag, tag_id)
@@ -696,8 +792,8 @@ def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer
         num_tags = min(args.random_tags, len(other_tags))
         other_tags = random.sample(
             [
-                (i, tag,
-                for i, tag
+                (i, tag, 0)
+                for i, tag in enumerate(tags[: round(args.random_tags * 1.5)])
             ],
             num_tags,
         )
@@ -713,25 +809,30 @@ def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer
         artist_txt = f"by {format_nl_list(artist_list)}"
     else:
         artist_txt = ""
+
     character_tag = tag_by_category[TAG_CHARACTER]
     if character_tag:
         tags = [tag for _, tag, _ in character_tag[:4]]
         character_txt = f"named {format_nl_list(tags)}"
     else:
         character_txt = ""
+
     species_tag = tag_by_category[TAG_SPECIES]
     if species_tag:
-        species_txt =
+        species_txt = (
+            "of a "
+            if len(character_tag) <= 1 and len(species_tag) <= 1
+            else "of "
+        )
         species_txt += format_nl_list([tp[1] for tp in species_tag[:4]])
     else:
         if character_tag:
             species_txt = (
-                " a character"
-                if len(character_tag) <= 1
-                else " characters"
+                " a character" if len(character_tag) <= 1 else " characters"
            )
         else:
             species_txt = ""
+
     copyright_tag = tag_by_category[TAG_COPYRIGHT]
     if copyright_tag:
         tags = [tag for _, tag, *_ in copyright_tag[:4]]