Freak-ppa committed (verified)
Commit b298049 · 1 Parent(s): 311187c

Upload 18 files

.gitattributes CHANGED
@@ -62,3 +62,6 @@ ComfyUI/temp/ComfyUI_temp_lhrdf_00001_.png filter=lfs diff=lfs merge=lfs -text
  ComfyUI/temp/ComfyUI_temp_lhrdf_00002_.png filter=lfs diff=lfs merge=lfs -text
  ComfyUI/temp/ComfyUI_temp_pxrdj_00001_.png filter=lfs diff=lfs merge=lfs -text
  ComfyUI/temp/ComfyUI_temp_pxrdj_00002_.png filter=lfs diff=lfs merge=lfs -text
+ ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_001.png filter=lfs diff=lfs merge=lfs -text
+ ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_002.png filter=lfs diff=lfs merge=lfs -text
+ ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_003.png filter=lfs diff=lfs merge=lfs -text
ComfyUI/custom_nodes/img2txt-comfyui-nodes/__init__.py ADDED
@@ -0,0 +1,9 @@
from .src.img2txt_node import Img2TxtNode

NODE_CLASS_MAPPINGS = {
    "img2txt BLIP/Llava Multimodel Tagger": Img2TxtNode,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "img2txt BLIP/Llava Multimodel Tagger": "Image to Text - Auto Caption"
}
WEB_DIRECTORY = "./web"
ComfyUI/custom_nodes/img2txt-comfyui-nodes/pyproject.toml ADDED
@@ -0,0 +1,15 @@
[project]
name = "img2txt-comfyui-nodes"
description = "Get general description or specify questions to ask about images (medium, art style, background, etc.). Supports Chinese 🇨🇳 questions via MiniCPM model."
version = "1.1.4"
license = "LICENSE"
dependencies = ["transformers>=4.36.0", "bitsandbytes>=0.43.0", "timm>=1.0.7", "sentencepiece==0.1.99", "accelerate>=0.3.0", "deepspeed"]

[project.urls]
Repository = "https://github.com/christian-byrne/img2txt-comfyui-nodes"
# Used by Comfy Registry https://comfyregistry.org

[tool.comfy]
PublisherId = "christian-byrne"
DisplayName = "Img2txt - Auto Caption"
Icon = "https://img.icons8.com/?size=100&id=49374&format=png&color=000000"
ComfyUI/custom_nodes/img2txt-comfyui-nodes/requirements.txt ADDED
@@ -0,0 +1,6 @@
transformers>=4.36.0
bitsandbytes>=0.43.0
timm>=1.0.7
sentencepiece
accelerate>=0.3.0
deepspeed
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/__init__.py ADDED
File without changes
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/blip_img2txt.py ADDED
@@ -0,0 +1,92 @@
import os
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    BlipConfig,
    BlipTextConfig,
    BlipVisionConfig,
)

import torch
import model_management
import folder_paths


class BLIPImg2Txt:
    def __init__(
        self,
        conditional_caption: str,
        min_words: int,
        max_words: int,
        temperature: float,
        repetition_penalty: float,
        search_beams: int,
        model_id: str = "Salesforce/blip-image-captioning-large",
        custom_model_path: str = None,
    ):
        self.conditional_caption = conditional_caption
        self.model_id = model_id
        self.custom_model_path = custom_model_path

        if self.custom_model_path and os.path.exists(self.custom_model_path):
            self.model_path = self.custom_model_path
        else:
            self.model_path = folder_paths.get_full_path("blip", model_id)

        if temperature > 1.1 or temperature < 0.90:
            do_sample = True
            num_beams = 1
        else:
            do_sample = False
            num_beams = search_beams if search_beams > 1 else 1

        self.text_config_kwargs = {
            "do_sample": do_sample,
            "max_length": max_words,
            "min_length": min_words,
            "repetition_penalty": repetition_penalty,
            "padding": "max_length",
        }
        if not do_sample:
            self.text_config_kwargs["temperature"] = temperature
            self.text_config_kwargs["num_beams"] = num_beams

    def generate_caption(self, image: Image.Image) -> str:
        if image.mode != "RGB":
            image = image.convert("RGB")

        if self.model_path and os.path.exists(self.model_path):
            model_path = self.model_path
            local_files_only = True
        else:
            model_path = self.model_id
            local_files_only = False

        processor = BlipProcessor.from_pretrained(model_path, local_files_only=local_files_only)

        config_text = BlipTextConfig.from_pretrained(model_path, local_files_only=local_files_only)
        config_text.update(self.text_config_kwargs)
        config_vision = BlipVisionConfig.from_pretrained(model_path, local_files_only=local_files_only)
        config = BlipConfig.from_text_vision_configs(config_text, config_vision)

        model = BlipForConditionalGeneration.from_pretrained(
            model_path,
            config=config,
            torch_dtype=torch.float16,
            local_files_only=local_files_only,
        ).to(model_management.get_torch_device())

        inputs = processor(
            image,
            self.conditional_caption,
            return_tensors="pt",
        ).to(model_management.get_torch_device(), torch.float16)

        with torch.no_grad():
            out = model.generate(**inputs)
            ret = processor.decode(out[0], skip_special_tokens=True)

        del model
        torch.cuda.empty_cache()

        return ret
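For reference, a minimal usage sketch of the class above. It assumes the node's src/ files are importable as plain modules inside a ComfyUI environment (where `model_management` and `folder_paths` exist), and "photo.png" is a placeholder path, not a file in this commit.

# Usage sketch for BLIPImg2Txt (runs inside ComfyUI; "photo.png" is hypothetical).
from PIL import Image
from blip_img2txt import BLIPImg2Txt

captioner = BLIPImg2Txt(
    conditional_caption="a photograph of",  # prefix that conditions the caption
    min_words=36,
    max_words=128,
    temperature=1.0,       # inside the 0.90-1.1 band, so beam search is used
    repetition_penalty=1.2,
    search_beams=5,        # number of beams when beam search is active
)
print(captioner.generate_caption(Image.open("photo.png")))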
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/description_classifier.py ADDED
@@ -0,0 +1,8 @@
#!pip install transformers[sentencepiece]
# from transformers import pipeline
# text = "Angela Merkel is a politician in Germany and leader of the CDU"
# hypothesis_template = "This text is about {}"
# classes_verbalized = ["politics", "economy", "entertainment", "environment"]
# zeroshot_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0")  # change the model identifier here
# output = zeroshot_classifier(text, classes_verbalized, hypothesis_template=hypothesis_template, multi_label=False)
# print(output)
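The file above is kept entirely commented out. For reference, a runnable version of the same zero-shot classification call; it downloads the MoritzLaurer/deberta-v3-large-zeroshot-v2.0 checkpoint on first use.

# Uncommented sketch of the zero-shot classification snippet above.
from transformers import pipeline

text = "Angela Merkel is a politician in Germany and leader of the CDU"
hypothesis_template = "This text is about {}"
classes_verbalized = ["politics", "economy", "entertainment", "environment"]

zeroshot_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
)
output = zeroshot_classifier(
    text,
    classes_verbalized,
    hypothesis_template=hypothesis_template,
    multi_label=False,
)
print(output["labels"][0], output["scores"][0])  # top class and its score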
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/img2txt_node.py ADDED
@@ -0,0 +1,217 @@
"""
@author: christian-byrne
@title: Img2Txt auto captioning. Choose from models: BLIP, Llava, MiniCPM, MS-GIT. Use model combos and merge results. Specify questions to ask about images (medium, art style, background). Supports Chinese 🇨🇳 questions via MiniCPM.
@nickname: Image to Text - Auto Caption
"""

import torch
from torchvision import transforms

from .img_tensor_utils import TensorImgUtils
from .llava_img2txt import LlavaImg2Txt
from .blip_img2txt import BLIPImg2Txt
from .mini_cpm_img2txt import MiniPCMImg2Txt

from typing import Tuple

import os
import folder_paths


class Img2TxtNode:
    CATEGORY = "img2txt"

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "input_image": ("IMAGE",),
            },
            "optional": {
                "use_blip_model": (
                    "BOOLEAN",
                    {
                        "default": True,
                        "label_on": "Use BLIP (Requires 2Gb Disk)",
                        "label_off": "Don't use BLIP",
                    },
                ),
                "use_llava_model": (
                    "BOOLEAN",
                    {
                        "default": False,
                        "label_on": "Use Llava (Requires 15Gb Disk)",
                        "label_off": "Don't use Llava",
                    },
                ),
                "use_mini_pcm_model": (
                    "BOOLEAN",
                    {
                        "default": False,
                        "label_on": "Use MiniCPM (Requires 6Gb Disk)",
                        "label_off": "Don't use MiniCPM",
                    },
                ),
                "use_all_models": (
                    "BOOLEAN",
                    {
                        "default": False,
                        "label_on": "Use all models and combine outputs (Total Size: 20+Gb)",
                        "label_off": "Use selected models only",
                    },
                ),
                "blip_caption_prefix": (
                    "STRING",
                    {
                        "default": "a photograph of",
                    },
                ),
                "prompt_questions": (
                    "STRING",
                    {
                        "default": "What is the subject of this image?\nWhat are the mediums used to make this?\nWhat are the artistic styles this is reminiscent of?\nWhich famous artists is this reminiscent of?\nHow sharp or detailed is this image?\nWhat is the environment and background of this image?\nWhat are the objects in this image?\nWhat is the composition of this image?\nWhat is the color palette in this image?\nWhat is the lighting in this image?",
                        "multiline": True,
                    },
                ),
                "temperature": (
                    "FLOAT",
                    {
                        "default": 0.8,
                        "min": 0.1,
                        "max": 2.0,
                        "step": 0.01,
                        "display": "slider",
                    },
                ),
                "repetition_penalty": (
                    "FLOAT",
                    {
                        "default": 1.2,
                        "min": 0.1,
                        "max": 2.0,
                        "step": 0.01,
                        "display": "slider",
                    },
                ),
                "min_words": ("INT", {"default": 36}),
                "max_words": ("INT", {"default": 128}),
                "search_beams": ("INT", {"default": 5}),
                "exclude_terms": (
                    "STRING",
                    {
                        "default": "watermark, text, writing",
                    },
                ),
            },
            "hidden": {
                "unique_id": "UNIQUE_ID",
                "extra_pnginfo": "EXTRA_PNGINFO",
                "output_text": (
                    "STRING",
                    {
                        "default": "",
                    },
                ),
            },
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("caption",)
    FUNCTION = "main"
    OUTPUT_NODE = True

    def main(
        self,
        input_image: torch.Tensor,  # [Batch_n, H, W, 3-channel]
        use_blip_model: bool,
        use_llava_model: bool,
        use_all_models: bool,
        use_mini_pcm_model: bool,
        blip_caption_prefix: str,
        prompt_questions: str,
        temperature: float,
        repetition_penalty: float,
        min_words: int,
        max_words: int,
        search_beams: int,
        exclude_terms: str,
        output_text: str = "",
        unique_id=None,
        extra_pnginfo=None,
    ) -> Tuple[str, ...]:
        raw_image = transforms.ToPILImage()(
            TensorImgUtils.convert_to_type(input_image, "CHW")
        ).convert("RGB")

        if blip_caption_prefix == "":
            blip_caption_prefix = "a photograph of"

        captions = []
        if use_all_models or use_blip_model:
            blip_model_path = folder_paths.get_folder_paths("blip")[0]
            print(f"blip_model_path: {blip_model_path}")
            if not blip_model_path or not os.path.exists(blip_model_path):
                raise ValueError("BLIP model 'blip-image-captioning-large' not found in ComfyUI models directory. Please ensure it's in the 'models/blip' folder.")

            blip = BLIPImg2Txt(
                conditional_caption=blip_caption_prefix,
                min_words=min_words,
                max_words=max_words,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                search_beams=search_beams,
                custom_model_path=blip_model_path,
            )
            captions.append(blip.generate_caption(raw_image))

        if use_all_models or use_llava_model:
            llava_questions = prompt_questions.split("\n")
            llava_questions = [
                q
                for q in llava_questions
                if q != "" and q != " " and q != "\n" and q != "\n\n"
            ]
            if len(llava_questions) > 0:
                llava = LlavaImg2Txt(
                    question_list=llava_questions,
                    model_id="llava-hf/llava-1.5-7b-hf",
                    use_4bit_quantization=True,
                    use_low_cpu_mem=True,
                    use_flash2_attention=False,
                    max_tokens_per_chunk=300,
                )
                captions.append(llava.generate_caption(raw_image))

        if use_all_models or use_mini_pcm_model:
            mini_pcm = MiniPCMImg2Txt(
                question_list=prompt_questions.split("\n"),
                temperature=temperature,
            )
            captions.append(mini_pcm.generate_captions(raw_image))

        out_string = self.exclude(exclude_terms, self.merge_captions(captions))

        return {"ui": {"text": out_string}, "result": (out_string,)}

    def merge_captions(self, captions: list) -> str:
        """Merge captions from multiple models into one string.
        Necessary because we can expect the generated captions will generally
        be comma-separated fragments ordered by relevance - so combine
        fragments in an alternating order."""
        merged_caption = ""
        captions = [c.split(",") for c in captions]
        for i in range(max(len(c) for c in captions)):
            for j in range(len(captions)):
                if i < len(captions[j]) and captions[j][i].strip() != "":
                    merged_caption += captions[j][i].strip() + ", "
        return merged_caption

    def exclude(self, exclude_terms: str, out_string: str) -> str:
        # https://huggingface.co/Salesforce/blip-image-captioning-large/discussions/20
        exclude_terms = "arafed," + exclude_terms
        exclude_terms = [
            term.strip().lower() for term in exclude_terms.split(",") if term != ""
        ]
        for term in exclude_terms:
            out_string = out_string.replace(term, "")

        return out_string
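For reference, a standalone copy of the merge logic above, with made-up caption strings, to show how fragments from different models are interleaved in alternating order.

# Standalone sketch of Img2TxtNode.merge_captions (example captions are invented).
def merge_captions(captions):
    merged_caption = ""
    captions = [c.split(",") for c in captions]
    for i in range(max(len(c) for c in captions)):
        for j in range(len(captions)):
            if i < len(captions[j]) and captions[j][i].strip() != "":
                merged_caption += captions[j][i].strip() + ", "
    return merged_caption

blip_caption = "a photograph of a girl, pink dress, bright blue eyes"
llava_caption = "soft lighting, impressionist style, grassy field"
print(merge_captions([blip_caption, llava_caption]))
# -> "a photograph of a girl, soft lighting, pink dress, impressionist style, bright blue eyes, grassy field, "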
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/img_tensor_utils.py ADDED
@@ -0,0 +1,129 @@
import torch
from typing import Tuple


class TensorImgUtils:
    @staticmethod
    def from_to(from_type: list[str], to_type: list[str]):
        """Return a function that converts a tensor from one type to another. Args can be lists of strings or just strings (e.g., ["C", "H", "W"] or just "CHW")."""
        if isinstance(from_type, list):
            from_type = "".join(from_type)
        if isinstance(to_type, list):
            to_type = "".join(to_type)

        permute_arg = [from_type.index(c) for c in to_type]

        def convert(tensor: torch.Tensor) -> torch.Tensor:
            return tensor.permute(permute_arg)

        return convert

    @staticmethod
    def convert_to_type(tensor: torch.Tensor, to_type: str) -> torch.Tensor:
        """Convert a tensor to a specific type."""
        from_type = TensorImgUtils.identify_type(tensor)[0]
        if from_type == list(to_type):
            return tensor

        if len(from_type) == 4 and len(to_type) == 3:
            # If converting from a batched tensor to a non-batched tensor, squeeze the batch dimension
            tensor = tensor.squeeze(0)
            from_type = from_type[1:]
        if len(from_type) == 3 and len(to_type) == 4:
            # If converting from a non-batched tensor to a batched tensor, unsqueeze the batch dimension
            tensor = tensor.unsqueeze(0)
            from_type = ["B"] + from_type

        return TensorImgUtils.from_to(from_type, list(to_type))(tensor)

    @staticmethod
    def identify_type(tensor: torch.Tensor) -> Tuple[list[str], str]:
        """Identify the type of image tensor. Doesn't currently check for BHW. Returns one of the following:"""
        dim_n = tensor.dim()
        if dim_n == 2:
            return (["H", "W"], "HW")
        elif dim_n == 3:  # HWA, AHW, HWC, or CHW
            if tensor.size(2) == 3:
                return (["H", "W", "C"], "HWRGB")
            elif tensor.size(2) == 4:
                return (["H", "W", "C"], "HWRGBA")
            elif tensor.size(0) == 3:
                return (["C", "H", "W"], "RGBHW")
            elif tensor.size(0) == 4:
                return (["C", "H", "W"], "RGBAHW")
            elif tensor.size(2) == 1:
                return (["H", "W", "C"], "HWA")
            elif tensor.size(0) == 1:
                return (["C", "H", "W"], "AHW")
        elif dim_n == 4:  # BHWC or BCHW
            if tensor.size(3) >= 3:  # BHWRGB or BHWRGBA
                if tensor.size(3) == 3:
                    return (["B", "H", "W", "C"], "BHWRGB")
                elif tensor.size(3) == 4:
                    return (["B", "H", "W", "C"], "BHWRGBA")
            elif tensor.size(1) >= 3:
                if tensor.size(1) == 3:
                    return (["B", "C", "H", "W"], "BRGBHW")
                elif tensor.size(1) == 4:
                    return (["B", "C", "H", "W"], "BRGBAHW")
        else:
            raise ValueError(
                f"{dim_n} dimensions is not a valid number of dimensions for an image tensor."
            )

        raise ValueError(
            f"Could not determine shape of Tensor with {dim_n} dimensions and {tensor.shape} shape."
        )

    @staticmethod
    def test_squeeze_batch(tensor: torch.Tensor, strict=False) -> torch.Tensor:
        # Check if the tensor has a batch dimension (size 4)
        if tensor.dim() == 4:
            if tensor.size(0) == 1 or not strict:
                # If it has a batch dimension with size 1, remove it. It represents a single image.
                return tensor.squeeze(0)
            else:
                raise ValueError(
                    f"This is not a single image. It's a batch of {tensor.size(0)} images."
                )
        else:
            # Otherwise, it doesn't have a batch dimension, so just return the tensor as is.
            return tensor

    @staticmethod
    def test_unsqueeze_batch(tensor: torch.Tensor) -> torch.Tensor:
        # Check if the tensor has a batch dimension (size 4)
        if tensor.dim() == 3:
            # If it doesn't have a batch dimension, add one. It represents a single image.
            return tensor.unsqueeze(0)
        else:
            # Otherwise, it already has a batch dimension, so just return the tensor as is.
            return tensor

    @staticmethod
    def most_pixels(img_tensors: list[torch.Tensor]) -> torch.Tensor:
        sizes = [
            TensorImgUtils.height_width(img)[0] * TensorImgUtils.height_width(img)[1]
            for img in img_tensors
        ]
        return img_tensors[sizes.index(max(sizes))]

    @staticmethod
    def height_width(image: torch.Tensor) -> Tuple[int, int]:
        """Like torchvision.transforms methods, this method assumes Tensor to
        have [..., H, W] shape, where ... means an arbitrary number of leading
        dimensions
        """
        return image.shape[-2:]

    @staticmethod
    def smaller_axis(image: torch.Tensor) -> int:
        h, w = TensorImgUtils.height_width(image)
        return 2 if h < w else 3

    @staticmethod
    def larger_axis(image: torch.Tensor) -> int:
        h, w = TensorImgUtils.height_width(image)
        return 2 if h > w else 3
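For reference, a small sketch of the conversion the node relies on: a ComfyUI-style (B, H, W, C) image batch permuted to (C, H, W) before being handed to torchvision's ToPILImage. It only needs torch and assumes img_tensor_utils.py is importable on its own; the tensor here is random placeholder data.

# Sketch: BHWC -> CHW conversion with TensorImgUtils (random data, hypothetical import path).
import torch
from img_tensor_utils import TensorImgUtils

batch = torch.rand(1, 512, 768, 3)                  # like a LoadImage output: B, H, W, RGB
print(TensorImgUtils.identify_type(batch))          # (['B', 'H', 'W', 'C'], 'BHWRGB')

chw = TensorImgUtils.convert_to_type(batch, "CHW")  # squeezes the batch dim, then permutes
print(chw.shape)                                    # torch.Size([3, 512, 768])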
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/keyword_extract.py ADDED
@@ -0,0 +1,114 @@
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import nltk


def nltk_speach_tag(sentence):
    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")
    nltk.download("stopwords")

    # Tokenize the sentence
    tokens = word_tokenize(sentence)

    # Filter out stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [
        word for word in tokens if word.lower() not in stop_words and word.isalnum()
    ]

    # Perform Part-of-Speech tagging
    tagged_tokens = pos_tag(filtered_tokens)

    # Extract nouns and proper nouns
    salient_tokens = [
        token
        for token, pos in tagged_tokens
        if pos in ["NN", "NNP", "NNS", "NNPS", "ADJ", "JJ", "FW"]
    ]
    salient_tokens = list(set(salient_tokens))

    # Re-add commas or periods relative to the original sentence

    comma_period_indices = [i for i, char in enumerate(sentence) if char in [",", "."]]
    salient_tokens_indices = [sentence.index(token) for token in salient_tokens]

    # Add commas or periods between words if there was one in the original sentence
    out = ""
    for i, index in enumerate(salient_tokens_indices):
        out += salient_tokens[i]
        distance_between_next = (
            salient_tokens_indices[i + 1] - index
            if i + 1 < len(salient_tokens_indices)
            else None
        )

        puncuated = False
        if not distance_between_next:
            puncuated = True
        else:
            for i in range(index, index + distance_between_next):
                if i in comma_period_indices:
                    puncuated = True
                    break

        if not puncuated:
            # IF the previous word was an adjective, and current is a noun, add a space
            if (
                i > 0
                and tagged_tokens[i - 1][1] in ["JJ", "ADJ"]
                and tagged_tokens[i][1] in ["NN", "NNP", "NNS", "NNPS"]
            ):
                out += " "
            else:
                out += ", "
        else:
            out += ". "

    # Add the last token
    out += sentence[-1]

    # Print the salient tokens
    return out.strip().strip(",").strip(".").strip()


def extract_keywords(text: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-keyword-extractor")
    model = AutoModelForTokenClassification.from_pretrained(
        "yanekyuk/bert-keyword-extractor"
    )
    """Return keywords from text using a BERT model trained for keyword extraction as
    a comma-separated string."""
    print(f"Extracting keywords from text: {text}")

    for char in ["\n", "\t", "\r"]:
        text = text.replace(char, " ")

    sentences = text.split(".")
    result = ""

    for sentence in sentences:
        print(f"Extracting keywords from sentence: {sentence}")
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_token_class_ids = logits.argmax(dim=-1)

        predicted_keywords = []
        for token_id, token in zip(
            predicted_token_class_ids[0],
            tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]),
        ):
            if token_id == 1:
                predicted_keywords.append(token)

        print(f"Extracted keywords: {predicted_keywords}")
        result += ", ".join(predicted_keywords) + ", "

    print(f"All Keywords: {result}")
    return result
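For reference, a hedged usage sketch of extract_keywords above. It assumes nltk and transformers are installed (nltk is imported by this module but is not listed in requirements.txt), downloads the yanekyuk/bert-keyword-extractor checkpoint on first use, and the caption string is invented.

# Sketch: pulling keywords out of a generated caption (hypothetical input text).
from keyword_extract import extract_keywords

caption = (
    "A photograph of a girl in a pink dress standing in a grassy field. "
    "Soft impressionist lighting."
)
print(extract_keywords(caption))  # e.g. "girl, pink, dress, field, ..."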
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/llava_img2txt.py ADDED
@@ -0,0 +1,131 @@
from PIL import Image
import torch
import model_management
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig


class LlavaImg2Txt:
    """
    A class to generate text captions for images using the Llava model.

    Args:
        question_list (list[str]): A list of questions to ask the model about the image.
        model_id (str): The model's name in the Hugging Face model hub.
        use_4bit_quantization (bool): Whether to use 4-bit quantization to reduce memory usage. 4-bit quantization reduces the precision of model parameters, potentially affecting the quality of generated outputs. Use if VRAM is limited. Default is True.
        use_low_cpu_mem (bool): In low_cpu_mem_usage mode, the model is initialized with optimizations aimed at reducing CPU memory consumption. This can be beneficial when working with large models or limited computational resources. Default is True.
        use_flash2_attention (bool): Whether to use Flash-Attention 2. Flash-Attention 2 focuses on optimizing attention mechanisms, which are crucial for the model's performance during generation. Use if computational resources are abundant. Default is False.
        max_tokens_per_chunk (int): The maximum number of tokens to generate per prompt chunk. Default is 300.
    """

    def __init__(
        self,
        question_list,
        model_id: str = "llava-hf/llava-1.5-7b-hf",
        use_4bit_quantization: bool = True,
        use_low_cpu_mem: bool = True,
        use_flash2_attention: bool = False,
        max_tokens_per_chunk: int = 300,
    ):
        self.question_list = question_list
        self.model_id = model_id
        self.use_4bit = use_4bit_quantization
        self.use_flash2 = use_flash2_attention
        self.use_low_cpu_mem = use_low_cpu_mem
        self.max_tokens_per_chunk = max_tokens_per_chunk

    def generate_caption(
        self,
        raw_image: Image.Image,
    ) -> str:
        """
        Generate a caption for an image using the Llava model.

        Args:
            raw_image (Image): Image to generate caption for
        """
        # Convert Image to RGB first
        if raw_image.mode != "RGB":
            raw_image = raw_image.convert("RGB")

        dtype = torch.float16
        quant_config = BitsAndBytesConfig(
            load_in_4bit=self.use_4bit,
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_quant_type="fp4",
        )

        model = LlavaForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=dtype,
            low_cpu_mem_usage=self.use_low_cpu_mem,
            use_flash_attention_2=self.use_flash2,
            quantization_config=quant_config,
        )

        # model.to() is not supported for 4-bit or 8-bit bitsandbytes models. With 4-bit quantization, use the model as it is, since the model will already be set to the correct devices and casted to the correct `dtype`.
        if torch.cuda.is_available() and not self.use_4bit:
            model = model.to(model_management.get_torch_device(), torch.float16)

        processor = AutoProcessor.from_pretrained(self.model_id)
        prompt_chunks = self.__get_prompt_chunks(chunk_size=4)

        caption = ""
        with torch.no_grad():
            for prompt_list in prompt_chunks:
                prompt = self.__get_single_answer_prompt(prompt_list)
                inputs = processor(prompt, raw_image, return_tensors="pt").to(
                    model_management.get_torch_device(), torch.float16
                )
                output = model.generate(
                    **inputs, max_new_tokens=self.max_tokens_per_chunk, do_sample=False
                )
                decoded = processor.decode(output[0][2:])
                cleaned = self.clean_output(decoded)
                caption += cleaned

        del model
        torch.cuda.empty_cache()

        return caption

    def clean_output(self, decoded_output, delimiter=","):
        output_only = decoded_output.split("ASSISTANT: ")[1]
        lines = output_only.split("\n")
        cleaned_output = ""
        for line in lines:
            cleaned_output += self.__replace_delimiter(line, ".", delimiter)

        return cleaned_output

    def __get_single_answer_prompt(self, questions):
        """
        For multiple turns conversation:
        "USER: <image>\n<prompt1> ASSISTANT: <answer1></s>USER: <prompt2> ASSISTANT: <answer2></s>USER: <prompt3> ASSISTANT:"
        From: https://huggingface.co/docs/transformers/en/model_doc/llava#usage-tips
        Not sure how the formatting works for multi-turn but those are the docs.
        """
        prompt = "USER: <image>\n"
        for index, question in enumerate(questions):
            if index != 0:
                prompt += "USER: "
            prompt += f"{question} </s >"
        prompt += "ASSISTANT: "

        return prompt

    def __replace_delimiter(self, text: str, old, new=","):
        """Replace only the LAST instance of old with new"""
        if old not in text:
            return text.strip() + " "
        last_old_index = text.rindex(old)
        replaced = text[:last_old_index] + new + text[last_old_index + len(old):]
        return replaced.strip() + " "

    def __get_prompt_chunks(self, chunk_size=4):
        prompt_chunks = []
        for index, feature in enumerate(self.question_list):
            if index % chunk_size == 0:
                prompt_chunks.append([feature])
            else:
                prompt_chunks[-1].append(feature)
        return prompt_chunks
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from PIL import Image
3
+ from transformers import AutoModel, AutoTokenizer
4
+
5
+ import model_management
6
+
7
+ class MiniPCMImg2Txt:
8
+ def __init__(self, question_list: list[str], temperature: float = 0.7):
9
+ self.model_id = "openbmb/MiniCPM-V-2"
10
+ self.question_list = question_list
11
+ self.question_list = self.__create_question_list()
12
+ self.temperature = temperature
13
+
14
+ def __create_question_list(self) -> list:
15
+ ret = []
16
+ for q in self.question_list:
17
+ ret.append({"role": "user", "content": q})
18
+ return ret
19
+
20
+ def generate_captions(self, raw_image: Image.Image) -> str:
21
+ device = model_management.get_torch_device()
22
+
23
+ # For Nvidia GPUs support BF16 (like A100, H100, RTX3090)
24
+ # For Nvidia GPUs do NOT support BF16 (like V100, T4, RTX2080)
25
+ torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
26
+
27
+ model = AutoModel.from_pretrained(
28
+ "openbmb/MiniCPM-V-2", trust_remote_code=True, torch_dtype=torch_dtype
29
+ )
30
+ model = model.to(device=device, dtype=torch_dtype)
31
+
32
+ tokenizer = AutoTokenizer.from_pretrained(
33
+ self.model_id, trust_remote_code=True
34
+ )
35
+ model.eval()
36
+
37
+ if raw_image.mode != "RGB":
38
+ raw_image = raw_image.convert("RGB")
39
+
40
+ with torch.no_grad():
41
+ res, _, _ = model.chat(
42
+ image=raw_image,
43
+ msgs=self.question_list,
44
+ context=None,
45
+ tokenizer=tokenizer,
46
+ sampling=True,
47
+ temperature=self.temperature,
48
+ )
49
+
50
+ del model
51
+ torch.cuda.empty_cache()
52
+
53
+ return res
ComfyUI/custom_nodes/img2txt-comfyui-nodes/web/show-output-text.js ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { app } from "../../../scripts/app.js";
2
+ import { ComfyWidgets } from "../../../scripts/widgets.js";
3
+
4
+ // Displays output caption text
5
+ app.registerExtension({
6
+ name: "Img2TxtNode",
7
+ async beforeRegisterNodeDef(nodeType, nodeData, app) {
8
+ if (nodeData.name === "img2txt BLIP/Llava Multimodel Tagger") {
9
+ function populate(message) {
10
+ console.log("message", message);
11
+ console.log("message.text", message.text);
12
+
13
+ const insertIndex = this.widgets.findIndex((w) => w.name === "output_text");
14
+ if (insertIndex !== -1) {
15
+ for (let i = insertIndex; i < this.widgets.length; i++) {
16
+ this.widgets[i].onRemove?.();
17
+ }
18
+ this.widgets.length = insertIndex;
19
+ }
20
+
21
+ const outputWidget = ComfyWidgets["STRING"](
22
+ this,
23
+ "output_text",
24
+ ["STRING", { multiline: true }],
25
+ app
26
+ ).widget;
27
+ outputWidget.inputEl.readOnly = true;
28
+ outputWidget.inputEl.style.opacity = 0.6;
29
+ outputWidget.value = message.text.join("");
30
+
31
+ requestAnimationFrame(() => {
32
+ const size_ = this.computeSize();
33
+ if (size_[0] < this.size[0]) {
34
+ size_[0] = this.size[0];
35
+ }
36
+ if (size_[1] < this.size[1]) {
37
+ size_[1] = this.size[1];
38
+ }
39
+ this.onResize?.(size_);
40
+ app.graph.setDirtyCanvas(true, false);
41
+ });
42
+ }
43
+
44
+ const onExecuted = nodeType.prototype.onExecuted;
45
+ nodeType.prototype.onExecuted = function (message) {
46
+ onExecuted?.apply(this, arguments);
47
+ populate.call(this, message);
48
+ };
49
+ }
50
+ },
51
+ });
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_001.png ADDED

Git LFS Details

  • SHA256: c71dc3dab484d9362680510fbbfe725e0cd988e0575b79acf339e7296faedb3a
  • Pointer size: 133 Bytes
  • Size of remote file: 13.7 MB
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_002.png ADDED

Git LFS Details

  • SHA256: c2db46defa1b80a63256d4f0d85dc010e6950ae30f56b15c86bd1871469d4783
  • Pointer size: 133 Bytes
  • Size of remote file: 13.8 MB
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_003.png ADDED

Git LFS Details

  • SHA256: 006a87fa5d86a9addc5953a3c9f6fd20b9bbf06efe328a9412bd5277bfd4aeb5
  • Pointer size: 132 Bytes
  • Size of remote file: 9.54 MB
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/workflow-examples/img2img.json ADDED
@@ -0,0 +1,523 @@
1
+ {
2
+ "last_node_id": 51,
3
+ "last_link_id": 60,
4
+ "nodes": [
5
+ {
6
+ "id": 41,
7
+ "type": "CLIPTextEncode",
8
+ "pos": [
9
+ 1055,
10
+ 571
11
+ ],
12
+ "size": {
13
+ "0": 348.9403381347656,
14
+ "1": 56.439388275146484
15
+ },
16
+ "flags": {},
17
+ "order": 5,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "clip",
22
+ "type": "CLIP",
23
+ "link": 50
24
+ },
25
+ {
26
+ "name": "text",
27
+ "type": "STRING",
28
+ "link": 60,
29
+ "widget": {
30
+ "name": "text"
31
+ }
32
+ }
33
+ ],
34
+ "outputs": [
35
+ {
36
+ "name": "CONDITIONING",
37
+ "type": "CONDITIONING",
38
+ "links": [
39
+ 44
40
+ ],
41
+ "shape": 3,
42
+ "slot_index": 0
43
+ }
44
+ ],
45
+ "properties": {
46
+ "Node name for S&R": "CLIPTextEncode"
47
+ },
48
+ "widgets_values": [
49
+ ""
50
+ ]
51
+ },
52
+ {
53
+ "id": 39,
54
+ "type": "KSampler",
55
+ "pos": [
56
+ 1587,
57
+ 982
58
+ ],
59
+ "size": {
60
+ "0": 315,
61
+ "1": 262
62
+ },
63
+ "flags": {},
64
+ "order": 6,
65
+ "mode": 0,
66
+ "inputs": [
67
+ {
68
+ "name": "model",
69
+ "type": "MODEL",
70
+ "link": 42
71
+ },
72
+ {
73
+ "name": "positive",
74
+ "type": "CONDITIONING",
75
+ "link": 44
76
+ },
77
+ {
78
+ "name": "negative",
79
+ "type": "CONDITIONING",
80
+ "link": 45
81
+ },
82
+ {
83
+ "name": "latent_image",
84
+ "type": "LATENT",
85
+ "link": 58
86
+ }
87
+ ],
88
+ "outputs": [
89
+ {
90
+ "name": "LATENT",
91
+ "type": "LATENT",
92
+ "links": [
93
+ 48
94
+ ],
95
+ "shape": 3,
96
+ "slot_index": 0
97
+ }
98
+ ],
99
+ "properties": {
100
+ "Node name for S&R": "KSampler"
101
+ },
102
+ "widgets_values": [
103
+ 290872458059323,
104
+ "randomize",
105
+ 20,
106
+ 8,
107
+ "euler",
108
+ "normal",
109
+ 1
110
+ ]
111
+ },
112
+ {
113
+ "id": 45,
114
+ "type": "VAEDecode",
115
+ "pos": [
116
+ 1998,
117
+ 1018
118
+ ],
119
+ "size": {
120
+ "0": 210,
121
+ "1": 46
122
+ },
123
+ "flags": {},
124
+ "order": 7,
125
+ "mode": 0,
126
+ "inputs": [
127
+ {
128
+ "name": "samples",
129
+ "type": "LATENT",
130
+ "link": 48
131
+ },
132
+ {
133
+ "name": "vae",
134
+ "type": "VAE",
135
+ "link": 49
136
+ }
137
+ ],
138
+ "outputs": [
139
+ {
140
+ "name": "IMAGE",
141
+ "type": "IMAGE",
142
+ "links": [
143
+ 55
144
+ ],
145
+ "shape": 3,
146
+ "slot_index": 0
147
+ }
148
+ ],
149
+ "properties": {
150
+ "Node name for S&R": "VAEDecode"
151
+ }
152
+ },
153
+ {
154
+ "id": 48,
155
+ "type": "PreviewImage",
156
+ "pos": [
157
+ 2039,
158
+ 1262
159
+ ],
160
+ "size": {
161
+ "0": 210,
162
+ "1": 246
163
+ },
164
+ "flags": {},
165
+ "order": 8,
166
+ "mode": 0,
167
+ "inputs": [
168
+ {
169
+ "name": "images",
170
+ "type": "IMAGE",
171
+ "link": 55
172
+ }
173
+ ],
174
+ "properties": {
175
+ "Node name for S&R": "PreviewImage"
176
+ }
177
+ },
178
+ {
179
+ "id": 42,
180
+ "type": "CLIPTextEncode",
181
+ "pos": [
182
+ 1056,
183
+ 683
184
+ ],
185
+ "size": {
186
+ "0": 352.9139404296875,
187
+ "1": 113.16606140136719
188
+ },
189
+ "flags": {},
190
+ "order": 3,
191
+ "mode": 0,
192
+ "inputs": [
193
+ {
194
+ "name": "clip",
195
+ "type": "CLIP",
196
+ "link": 51
197
+ }
198
+ ],
199
+ "outputs": [
200
+ {
201
+ "name": "CONDITIONING",
202
+ "type": "CONDITIONING",
203
+ "links": [
204
+ 45
205
+ ],
206
+ "shape": 3,
207
+ "slot_index": 0
208
+ }
209
+ ],
210
+ "properties": {
211
+ "Node name for S&R": "CLIPTextEncode"
212
+ },
213
+ "widgets_values": [
214
+ "text, watermark"
215
+ ]
216
+ },
217
+ {
218
+ "id": 50,
219
+ "type": "VAEEncode",
220
+ "pos": [
221
+ 1119,
222
+ 1329
223
+ ],
224
+ "size": {
225
+ "0": 201.4841766357422,
226
+ "1": 55.59581756591797
227
+ },
228
+ "flags": {},
229
+ "order": 4,
230
+ "mode": 0,
231
+ "inputs": [
232
+ {
233
+ "name": "pixels",
234
+ "type": "IMAGE",
235
+ "link": 56
236
+ },
237
+ {
238
+ "name": "vae",
239
+ "type": "VAE",
240
+ "link": 57
241
+ }
242
+ ],
243
+ "outputs": [
244
+ {
245
+ "name": "LATENT",
246
+ "type": "LATENT",
247
+ "links": [
248
+ 58
249
+ ],
250
+ "shape": 3,
251
+ "slot_index": 0
252
+ }
253
+ ],
254
+ "properties": {
255
+ "Node name for S&R": "VAEEncode"
256
+ }
257
+ },
258
+ {
259
+ "id": 11,
260
+ "type": "LoadImage",
261
+ "pos": [
262
+ -135,
263
+ 907
264
+ ],
265
+ "size": {
266
+ "0": 670,
267
+ "1": 460
268
+ },
269
+ "flags": {},
270
+ "order": 0,
271
+ "mode": 0,
272
+ "outputs": [
273
+ {
274
+ "name": "IMAGE",
275
+ "type": "IMAGE",
276
+ "links": [
277
+ 56,
278
+ 59
279
+ ],
280
+ "shape": 3,
281
+ "slot_index": 0
282
+ },
283
+ {
284
+ "name": "MASK",
285
+ "type": "MASK",
286
+ "links": [],
287
+ "shape": 3,
288
+ "slot_index": 1
289
+ }
290
+ ],
291
+ "properties": {
292
+ "Node name for S&R": "LoadImage"
293
+ },
294
+ "widgets_values": [
295
+ "example.png",
296
+ "image"
297
+ ]
298
+ },
299
+ {
300
+ "id": 40,
301
+ "type": "CheckpointLoaderSimple",
302
+ "pos": [
303
+ 1124,
304
+ 1019
305
+ ],
306
+ "size": {
307
+ "0": 315,
308
+ "1": 98
309
+ },
310
+ "flags": {},
311
+ "order": 1,
312
+ "mode": 0,
313
+ "outputs": [
314
+ {
315
+ "name": "MODEL",
316
+ "type": "MODEL",
317
+ "links": [
318
+ 42
319
+ ],
320
+ "shape": 3,
321
+ "slot_index": 0
322
+ },
323
+ {
324
+ "name": "CLIP",
325
+ "type": "CLIP",
326
+ "links": [
327
+ 50,
328
+ 51
329
+ ],
330
+ "shape": 3,
331
+ "slot_index": 1
332
+ },
333
+ {
334
+ "name": "VAE",
335
+ "type": "VAE",
336
+ "links": [
337
+ 49,
338
+ 57
339
+ ],
340
+ "shape": 3,
341
+ "slot_index": 2
342
+ }
343
+ ],
344
+ "properties": {
345
+ "Node name for S&R": "CheckpointLoaderSimple"
346
+ },
347
+ "widgets_values": [
348
+ "dreamshaper_8.safetensors"
349
+ ]
350
+ },
351
+ {
352
+ "id": 51,
353
+ "type": "img2txt BLIP/Llava Multimodel Tagger",
354
+ "pos": [
355
+ 605,
356
+ 881
357
+ ],
358
+ "size": {
359
+ "0": 427.2057800292969,
360
+ "1": 476.26934814453125
361
+ },
362
+ "flags": {},
363
+ "order": 2,
364
+ "mode": 0,
365
+ "inputs": [
366
+ {
367
+ "name": "input_image",
368
+ "type": "IMAGE",
369
+ "link": 59
370
+ }
371
+ ],
372
+ "outputs": [
373
+ {
374
+ "name": "caption",
375
+ "type": "STRING",
376
+ "links": [
377
+ 60
378
+ ],
379
+ "shape": 3,
380
+ "slot_index": 0
381
+ }
382
+ ],
383
+ "properties": {
384
+ "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger"
385
+ },
386
+ "widgets_values": [
387
+ true,
388
+ false,
389
+ false,
390
+ false,
391
+ "a photograph of",
392
+ "What is the subject and background of this image?",
393
+ 0.7000000000000001,
394
+ 1.26,
395
+ 36,
396
+ 128,
397
+ 5,
398
+ "watermark, text, writing",
399
+ "a photograph of a girl dressed up, in pink dress and bright blue eyes poses in the grass with arms spread out in front of her face, holding an umbrella on a sky, "
400
+ ],
401
+ "color": "#322",
402
+ "bgcolor": "#533"
403
+ }
404
+ ],
405
+ "links": [
406
+ [
407
+ 42,
408
+ 40,
409
+ 0,
410
+ 39,
411
+ 0,
412
+ "MODEL"
413
+ ],
414
+ [
415
+ 44,
416
+ 41,
417
+ 0,
418
+ 39,
419
+ 1,
420
+ "CONDITIONING"
421
+ ],
422
+ [
423
+ 45,
424
+ 42,
425
+ 0,
426
+ 39,
427
+ 2,
428
+ "CONDITIONING"
429
+ ],
430
+ [
431
+ 48,
432
+ 39,
433
+ 0,
434
+ 45,
435
+ 0,
436
+ "LATENT"
437
+ ],
438
+ [
439
+ 49,
440
+ 40,
441
+ 2,
442
+ 45,
443
+ 1,
444
+ "VAE"
445
+ ],
446
+ [
447
+ 50,
448
+ 40,
449
+ 1,
450
+ 41,
451
+ 0,
452
+ "CLIP"
453
+ ],
454
+ [
455
+ 51,
456
+ 40,
457
+ 1,
458
+ 42,
459
+ 0,
460
+ "CLIP"
461
+ ],
462
+ [
463
+ 55,
464
+ 45,
465
+ 0,
466
+ 48,
467
+ 0,
468
+ "IMAGE"
469
+ ],
470
+ [
471
+ 56,
472
+ 11,
473
+ 0,
474
+ 50,
475
+ 0,
476
+ "IMAGE"
477
+ ],
478
+ [
479
+ 57,
480
+ 40,
481
+ 2,
482
+ 50,
483
+ 1,
484
+ "VAE"
485
+ ],
486
+ [
487
+ 58,
488
+ 50,
489
+ 0,
490
+ 39,
491
+ 3,
492
+ "LATENT"
493
+ ],
494
+ [
495
+ 59,
496
+ 11,
497
+ 0,
498
+ 51,
499
+ 0,
500
+ "IMAGE"
501
+ ],
502
+ [
503
+ 60,
504
+ 51,
505
+ 0,
506
+ 41,
507
+ 1,
508
+ "STRING"
509
+ ]
510
+ ],
511
+ "groups": [],
512
+ "config": {},
513
+ "extra": {
514
+ "ds": {
515
+ "scale": 0.9090909090909091,
516
+ "offset": {
517
+ "0": 304.575645264068,
518
+ "1": -258.56908735931404
519
+ }
520
+ }
521
+ },
522
+ "version": 0.4
523
+ }
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/workflow-examples/inpaint.json ADDED
@@ -0,0 +1,705 @@
1
+ {
2
+ "last_node_id": 61,
3
+ "last_link_id": 80,
4
+ "nodes": [
5
+ {
6
+ "id": 45,
7
+ "type": "VAEDecode",
8
+ "pos": [
9
+ 1998,
10
+ 1018
11
+ ],
12
+ "size": {
13
+ "0": 210,
14
+ "1": 46
15
+ },
16
+ "flags": {},
17
+ "order": 10,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "samples",
22
+ "type": "LATENT",
23
+ "link": 71
24
+ },
25
+ {
26
+ "name": "vae",
27
+ "type": "VAE",
28
+ "link": 49
29
+ }
30
+ ],
31
+ "outputs": [
32
+ {
33
+ "name": "IMAGE",
34
+ "type": "IMAGE",
35
+ "links": [
36
+ 55
37
+ ],
38
+ "shape": 3,
39
+ "slot_index": 0
40
+ }
41
+ ],
42
+ "properties": {
43
+ "Node name for S&R": "VAEDecode"
44
+ }
45
+ },
46
+ {
47
+ "id": 42,
48
+ "type": "CLIPTextEncode",
49
+ "pos": [
50
+ 1056,
51
+ 683
52
+ ],
53
+ "size": {
54
+ "0": 352.9139404296875,
55
+ "1": 113.16606140136719
56
+ },
57
+ "flags": {},
58
+ "order": 2,
59
+ "mode": 0,
60
+ "inputs": [
61
+ {
62
+ "name": "clip",
63
+ "type": "CLIP",
64
+ "link": 51
65
+ }
66
+ ],
67
+ "outputs": [
68
+ {
69
+ "name": "CONDITIONING",
70
+ "type": "CONDITIONING",
71
+ "links": [
72
+ 63
73
+ ],
74
+ "shape": 3,
75
+ "slot_index": 0
76
+ }
77
+ ],
78
+ "properties": {
79
+ "Node name for S&R": "CLIPTextEncode"
80
+ },
81
+ "widgets_values": [
82
+ "text, watermark"
83
+ ]
84
+ },
85
+ {
86
+ "id": 41,
87
+ "type": "CLIPTextEncode",
88
+ "pos": [
89
+ 1055,
90
+ 571
91
+ ],
92
+ "size": {
93
+ "0": 348.9403381347656,
94
+ "1": 56.439388275146484
95
+ },
96
+ "flags": {},
97
+ "order": 6,
98
+ "mode": 0,
99
+ "inputs": [
100
+ {
101
+ "name": "clip",
102
+ "type": "CLIP",
103
+ "link": 50
104
+ },
105
+ {
106
+ "name": "text",
107
+ "type": "STRING",
108
+ "link": 80,
109
+ "widget": {
110
+ "name": "text"
111
+ }
112
+ }
113
+ ],
114
+ "outputs": [
115
+ {
116
+ "name": "CONDITIONING",
117
+ "type": "CONDITIONING",
118
+ "links": [
119
+ 64
120
+ ],
121
+ "shape": 3,
122
+ "slot_index": 0
123
+ }
124
+ ],
125
+ "properties": {
126
+ "Node name for S&R": "CLIPTextEncode"
127
+ },
128
+ "widgets_values": [
129
+ ""
130
+ ]
131
+ },
132
+ {
133
+ "id": 58,
134
+ "type": "PreviewImage",
135
+ "pos": [
136
+ 616,
137
+ 1631
138
+ ],
139
+ "size": {
140
+ "0": 401.17840576171875,
141
+ "1": 246
142
+ },
143
+ "flags": {},
144
+ "order": 7,
145
+ "mode": 0,
146
+ "inputs": [
147
+ {
148
+ "name": "images",
149
+ "type": "IMAGE",
150
+ "link": 73
151
+ }
152
+ ],
153
+ "properties": {
154
+ "Node name for S&R": "PreviewImage"
155
+ }
156
+ },
157
+ {
158
+ "id": 57,
159
+ "type": "MaskToImage",
160
+ "pos": [
161
+ 617,
162
+ 1543
163
+ ],
164
+ "size": {
165
+ "0": 210,
166
+ "1": 26
167
+ },
168
+ "flags": {},
169
+ "order": 5,
170
+ "mode": 0,
171
+ "inputs": [
172
+ {
173
+ "name": "mask",
174
+ "type": "MASK",
175
+ "link": 78
176
+ }
177
+ ],
178
+ "outputs": [
179
+ {
180
+ "name": "IMAGE",
181
+ "type": "IMAGE",
182
+ "links": [
183
+ 73
184
+ ],
185
+ "shape": 3,
186
+ "slot_index": 0
187
+ }
188
+ ],
189
+ "properties": {
190
+ "Node name for S&R": "MaskToImage"
191
+ }
192
+ },
193
+ {
194
+ "id": 40,
195
+ "type": "CheckpointLoaderSimple",
196
+ "pos": [
197
+ 1044,
198
+ 1032
199
+ ],
200
+ "size": {
201
+ "0": 315,
202
+ "1": 98
203
+ },
204
+ "flags": {},
205
+ "order": 0,
206
+ "mode": 0,
207
+ "outputs": [
208
+ {
209
+ "name": "MODEL",
210
+ "type": "MODEL",
211
+ "links": [
212
+ 68
213
+ ],
214
+ "shape": 3,
215
+ "slot_index": 0
216
+ },
217
+ {
218
+ "name": "CLIP",
219
+ "type": "CLIP",
220
+ "links": [
221
+ 50,
222
+ 51
223
+ ],
224
+ "shape": 3,
225
+ "slot_index": 1
226
+ },
227
+ {
228
+ "name": "VAE",
229
+ "type": "VAE",
230
+ "links": [
231
+ 49,
232
+ 69
233
+ ],
234
+ "shape": 3,
235
+ "slot_index": 2
236
+ }
237
+ ],
238
+ "properties": {
239
+ "Node name for S&R": "CheckpointLoaderSimple"
240
+ },
241
+ "widgets_values": [
242
+ "experience_70-inpainting.safetensors"
243
+ ]
244
+ },
245
+ {
246
+ "id": 48,
247
+ "type": "PreviewImage",
248
+ "pos": [
249
+ 2039,
250
+ 1262
251
+ ],
252
+ "size": {
253
+ "0": 295.2332458496094,
254
+ "1": 293.2945251464844
255
+ },
256
+ "flags": {},
257
+ "order": 11,
258
+ "mode": 0,
259
+ "inputs": [
260
+ {
261
+ "name": "images",
262
+ "type": "IMAGE",
263
+ "link": 55
264
+ }
265
+ ],
266
+ "properties": {
267
+ "Node name for S&R": "PreviewImage"
268
+ }
269
+ },
270
+ {
271
+ "id": 56,
272
+ "type": "KSampler",
273
+ "pos": [
274
+ 1642,
275
+ 820
276
+ ],
277
+ "size": {
278
+ "0": 315,
279
+ "1": 262
280
+ },
281
+ "flags": {},
282
+ "order": 9,
283
+ "mode": 0,
284
+ "inputs": [
285
+ {
286
+ "name": "model",
287
+ "type": "MODEL",
288
+ "link": 68
289
+ },
290
+ {
291
+ "name": "positive",
292
+ "type": "CONDITIONING",
293
+ "link": 66
294
+ },
295
+ {
296
+ "name": "negative",
297
+ "type": "CONDITIONING",
298
+ "link": 67
299
+ },
300
+ {
301
+ "name": "latent_image",
302
+ "type": "LATENT",
303
+ "link": 65
304
+ }
305
+ ],
306
+ "outputs": [
307
+ {
308
+ "name": "LATENT",
309
+ "type": "LATENT",
310
+ "links": [
311
+ 71
312
+ ],
313
+ "shape": 3,
314
+ "slot_index": 0
315
+ }
316
+ ],
317
+ "properties": {
318
+ "Node name for S&R": "KSampler"
319
+ },
320
+ "widgets_values": [
321
+ 492464952856155,
322
+ "randomize",
323
+ 30,
324
+ 7,
325
+ "dpmpp_2m_sde_gpu",
326
+ "normal",
327
+ 0.8
328
+ ]
329
+ },
330
+ {
331
+ "id": 55,
332
+ "type": "ImageColorToMask",
333
+ "pos": [
334
+ 610,
335
+ 1425
336
+ ],
337
+ "size": {
338
+ "0": 315,
339
+ "1": 58
340
+ },
341
+ "flags": {},
342
+ "order": 3,
343
+ "mode": 0,
344
+ "inputs": [
345
+ {
346
+ "name": "image",
347
+ "type": "IMAGE",
348
+ "link": 61
349
+ }
350
+ ],
351
+ "outputs": [
352
+ {
353
+ "name": "MASK",
354
+ "type": "MASK",
355
+ "links": [
356
+ 77,
357
+ 78
358
+ ],
359
+ "shape": 3,
360
+ "slot_index": 0
361
+ }
362
+ ],
363
+ "properties": {
364
+ "Node name for S&R": "ImageColorToMask"
365
+ },
366
+ "widgets_values": [
367
+ 6198527
368
+ ]
369
+ },
370
+ {
371
+ "id": 54,
372
+ "type": "InpaintModelConditioning",
373
+ "pos": [
374
+ 1289,
375
+ 1377
376
+ ],
377
+ "size": {
378
+ "0": 216.59999084472656,
379
+ "1": 106
380
+ },
381
+ "flags": {},
382
+ "order": 8,
383
+ "mode": 0,
384
+ "inputs": [
385
+ {
386
+ "name": "positive",
387
+ "type": "CONDITIONING",
388
+ "link": 64
389
+ },
390
+ {
391
+ "name": "negative",
392
+ "type": "CONDITIONING",
393
+ "link": 63
394
+ },
395
+ {
396
+ "name": "vae",
397
+ "type": "VAE",
398
+ "link": 69
399
+ },
400
+ {
401
+ "name": "pixels",
402
+ "type": "IMAGE",
403
+ "link": 70
404
+ },
405
+ {
406
+ "name": "mask",
407
+ "type": "MASK",
408
+ "link": 77
409
+ }
410
+ ],
411
+ "outputs": [
412
+ {
413
+ "name": "positive",
414
+ "type": "CONDITIONING",
415
+ "links": [
416
+ 66
417
+ ],
418
+ "shape": 3,
419
+ "slot_index": 0
420
+ },
421
+ {
422
+ "name": "negative",
423
+ "type": "CONDITIONING",
424
+ "links": [
425
+ 67
426
+ ],
427
+ "shape": 3,
428
+ "slot_index": 1
429
+ },
430
+ {
431
+ "name": "latent",
432
+ "type": "LATENT",
433
+ "links": [
434
+ 65
435
+ ],
436
+ "shape": 3,
437
+ "slot_index": 2
438
+ }
439
+ ],
440
+ "properties": {
441
+ "Node name for S&R": "InpaintModelConditioning"
442
+ }
443
+ },
444
+ {
445
+ "id": 11,
446
+ "type": "LoadImage",
447
+ "pos": [
448
+ -135,
449
+ 907
450
+ ],
451
+ "size": {
452
+ "0": 670,
453
+ "1": 460
454
+ },
455
+ "flags": {},
456
+ "order": 1,
457
+ "mode": 0,
458
+ "outputs": [
459
+ {
460
+ "name": "IMAGE",
461
+ "type": "IMAGE",
462
+ "links": [
463
+ 61,
464
+ 70,
465
+ 79
466
+ ],
467
+ "shape": 3,
468
+ "slot_index": 0
469
+ },
470
+ {
471
+ "name": "MASK",
472
+ "type": "MASK",
473
+ "links": [],
474
+ "shape": 3,
475
+ "slot_index": 1
476
+ }
477
+ ],
478
+ "properties": {
479
+ "Node name for S&R": "LoadImage"
480
+ },
481
+ "widgets_values": [
482
+ "example.png",
483
+ "image"
484
+ ]
485
+ },
486
+ {
487
+ "id": 61,
488
+ "type": "img2txt BLIP/Llava Multimodel Tagger",
489
+ "pos": [
490
+ 599,
491
+ 886
492
+ ],
493
+ "size": [
494
+ 414.8329491017887,
495
+ 453.3791344354013
496
+ ],
497
+ "flags": {},
498
+ "order": 4,
499
+ "mode": 0,
500
+ "inputs": [
501
+ {
502
+ "name": "input_image",
503
+ "type": "IMAGE",
504
+ "link": 79
505
+ }
506
+ ],
507
+ "outputs": [
508
+ {
509
+ "name": "caption",
510
+ "type": "STRING",
511
+ "links": [
512
+ 80
513
+ ],
514
+ "shape": 3,
515
+ "slot_index": 0
516
+ }
517
+ ],
518
+ "properties": {
519
+ "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger"
520
+ },
521
+ "widgets_values": [
522
+ true,
523
+ false,
524
+ false,
525
+ false,
526
+ "a photograph of",
527
+ "What is the subject of this image?\n",
528
+ 0.8,
529
+ 1.2,
530
+ 36,
531
+ 128,
532
+ 5,
533
+ "watermark, text, writing"
534
+ ],
535
+ "color": "#322",
536
+ "bgcolor": "#533"
537
+ }
538
+ ],
539
+ "links": [
540
+ [
541
+ 49,
542
+ 40,
543
+ 2,
544
+ 45,
545
+ 1,
546
+ "VAE"
547
+ ],
548
+ [
549
+ 50,
550
+ 40,
551
+ 1,
552
+ 41,
553
+ 0,
554
+ "CLIP"
555
+ ],
556
+ [
557
+ 51,
558
+ 40,
559
+ 1,
560
+ 42,
561
+ 0,
562
+ "CLIP"
563
+ ],
564
+ [
565
+ 55,
566
+ 45,
567
+ 0,
568
+ 48,
569
+ 0,
570
+ "IMAGE"
571
+ ],
572
+ [
573
+ 61,
574
+ 11,
575
+ 0,
576
+ 55,
577
+ 0,
578
+ "IMAGE"
579
+ ],
580
+ [
581
+ 63,
582
+ 42,
583
+ 0,
584
+ 54,
585
+ 1,
586
+ "CONDITIONING"
587
+ ],
588
+ [
589
+ 64,
590
+ 41,
591
+ 0,
592
+ 54,
593
+ 0,
594
+ "CONDITIONING"
595
+ ],
596
+ [
597
+ 65,
598
+ 54,
599
+ 2,
600
+ 56,
601
+ 3,
602
+ "LATENT"
603
+ ],
604
+ [
605
+ 66,
606
+ 54,
607
+ 0,
608
+ 56,
609
+ 1,
610
+ "CONDITIONING"
611
+ ],
612
+ [
613
+ 67,
614
+ 54,
615
+ 1,
616
+ 56,
617
+ 2,
618
+ "CONDITIONING"
619
+ ],
620
+ [
621
+ 68,
622
+ 40,
623
+ 0,
624
+ 56,
625
+ 0,
626
+ "MODEL"
627
+ ],
628
+ [
629
+ 69,
630
+ 40,
631
+ 2,
632
+ 54,
633
+ 2,
634
+ "VAE"
635
+ ],
636
+ [
637
+ 70,
638
+ 11,
639
+ 0,
640
+ 54,
641
+ 3,
642
+ "IMAGE"
643
+ ],
644
+ [
645
+ 71,
646
+ 56,
647
+ 0,
648
+ 45,
649
+ 0,
650
+ "LATENT"
651
+ ],
652
+ [
653
+ 73,
654
+ 57,
655
+ 0,
656
+ 58,
657
+ 0,
658
+ "IMAGE"
659
+ ],
660
+ [
661
+ 77,
662
+ 55,
663
+ 0,
664
+ 54,
665
+ 4,
666
+ "MASK"
667
+ ],
668
+ [
669
+ 78,
670
+ 55,
671
+ 0,
672
+ 57,
673
+ 0,
674
+ "MASK"
675
+ ],
676
+ [
677
+ 79,
678
+ 11,
679
+ 0,
680
+ 61,
681
+ 0,
682
+ "IMAGE"
683
+ ],
684
+ [
685
+ 80,
686
+ 61,
687
+ 0,
688
+ 41,
689
+ 1,
690
+ "STRING"
691
+ ]
692
+ ],
693
+ "groups": [],
694
+ "config": {},
695
+ "extra": {
696
+ "ds": {
697
+ "scale": 0.8264462809917354,
698
+ "offset": {
699
+ "0": 478.9515963527572,
700
+ "1": -472.76124333876595
701
+ }
702
+ }
703
+ },
704
+ "version": 0.4
705
+ }
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/workflow-examples/txt2img.json ADDED
@@ -0,0 +1,498 @@
1
+ {
2
+ "last_node_id": 53,
3
+ "last_link_id": 61,
4
+ "nodes": [
5
+ {
6
+ "id": 41,
7
+ "type": "CLIPTextEncode",
8
+ "pos": [
9
+ 1055,
10
+ 571
11
+ ],
12
+ "size": {
13
+ "0": 348.9403381347656,
14
+ "1": 56.439388275146484
15
+ },
16
+ "flags": {},
17
+ "order": 5,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "clip",
22
+ "type": "CLIP",
23
+ "link": 50
24
+ },
25
+ {
26
+ "name": "text",
27
+ "type": "STRING",
28
+ "link": 61,
29
+ "widget": {
30
+ "name": "text"
31
+ }
32
+ }
33
+ ],
34
+ "outputs": [
35
+ {
36
+ "name": "CONDITIONING",
37
+ "type": "CONDITIONING",
38
+ "links": [
39
+ 44
40
+ ],
41
+ "shape": 3,
42
+ "slot_index": 0
43
+ }
44
+ ],
45
+ "properties": {
46
+ "Node name for S&R": "CLIPTextEncode"
47
+ },
48
+ "widgets_values": [
49
+ ""
50
+ ]
51
+ },
52
+ {
53
+ "id": 39,
54
+ "type": "KSampler",
55
+ "pos": [
56
+ 1587,
57
+ 982
58
+ ],
59
+ "size": {
60
+ "0": 315,
61
+ "1": 262
62
+ },
63
+ "flags": {},
64
+ "order": 6,
65
+ "mode": 0,
66
+ "inputs": [
67
+ {
68
+ "name": "model",
69
+ "type": "MODEL",
70
+ "link": 42
71
+ },
72
+ {
73
+ "name": "positive",
74
+ "type": "CONDITIONING",
75
+ "link": 44
76
+ },
77
+ {
78
+ "name": "negative",
79
+ "type": "CONDITIONING",
80
+ "link": 45
81
+ },
82
+ {
83
+ "name": "latent_image",
84
+ "type": "LATENT",
85
+ "link": 59
86
+ }
87
+ ],
88
+ "outputs": [
89
+ {
90
+ "name": "LATENT",
91
+ "type": "LATENT",
92
+ "links": [
93
+ 48
94
+ ],
95
+ "shape": 3,
96
+ "slot_index": 0
97
+ }
98
+ ],
99
+ "properties": {
100
+ "Node name for S&R": "KSampler"
101
+ },
102
+ "widgets_values": [
103
+ 438454791536393,
104
+ "randomize",
105
+ 20,
106
+ 8,
107
+ "euler",
108
+ "normal",
109
+ 1
110
+ ]
111
+ },
112
+ {
113
+ "id": 45,
114
+ "type": "VAEDecode",
115
+ "pos": [
116
+ 1998,
117
+ 1018
118
+ ],
119
+ "size": {
120
+ "0": 210,
121
+ "1": 46
122
+ },
123
+ "flags": {},
124
+ "order": 7,
125
+ "mode": 0,
126
+ "inputs": [
127
+ {
128
+ "name": "samples",
129
+ "type": "LATENT",
130
+ "link": 48
131
+ },
132
+ {
133
+ "name": "vae",
134
+ "type": "VAE",
135
+ "link": 49
136
+ }
137
+ ],
138
+ "outputs": [
139
+ {
140
+ "name": "IMAGE",
141
+ "type": "IMAGE",
142
+ "links": [
143
+ 55
144
+ ],
145
+ "shape": 3,
146
+ "slot_index": 0
147
+ }
148
+ ],
149
+ "properties": {
150
+ "Node name for S&R": "VAEDecode"
151
+ }
152
+ },
153
+ {
154
+ "id": 48,
155
+ "type": "PreviewImage",
156
+ "pos": [
157
+ 2039,
158
+ 1262
159
+ ],
160
+ "size": {
161
+ "0": 210,
162
+ "1": 246
163
+ },
164
+ "flags": {},
165
+ "order": 8,
166
+ "mode": 0,
167
+ "inputs": [
168
+ {
169
+ "name": "images",
170
+ "type": "IMAGE",
171
+ "link": 55
172
+ }
173
+ ],
174
+ "properties": {
175
+ "Node name for S&R": "PreviewImage"
176
+ }
177
+ },
178
+ {
179
+ "id": 42,
180
+ "type": "CLIPTextEncode",
181
+ "pos": [
182
+ 1056,
183
+ 683
184
+ ],
185
+ "size": {
186
+ "0": 352.9139404296875,
187
+ "1": 113.16606140136719
188
+ },
189
+ "flags": {},
190
+ "order": 4,
191
+ "mode": 0,
192
+ "inputs": [
193
+ {
194
+ "name": "clip",
195
+ "type": "CLIP",
196
+ "link": 51
197
+ }
198
+ ],
199
+ "outputs": [
200
+ {
201
+ "name": "CONDITIONING",
202
+ "type": "CONDITIONING",
203
+ "links": [
204
+ 45
205
+ ],
206
+ "shape": 3,
207
+ "slot_index": 0
208
+ }
209
+ ],
210
+ "properties": {
211
+ "Node name for S&R": "CLIPTextEncode"
212
+ },
213
+ "widgets_values": [
214
+ "text, watermark"
215
+ ]
216
+ },
217
+ {
218
+ "id": 52,
219
+ "type": "EmptyLatentImage",
220
+ "pos": [
221
+ 1126,
222
+ 1189
223
+ ],
224
+ "size": {
225
+ "0": 315,
226
+ "1": 106
227
+ },
228
+ "flags": {},
229
+ "order": 0,
230
+ "mode": 0,
231
+ "outputs": [
232
+ {
233
+ "name": "LATENT",
234
+ "type": "LATENT",
235
+ "links": [
236
+ 59
237
+ ],
238
+ "shape": 3,
239
+ "slot_index": 0
240
+ }
241
+ ],
242
+ "properties": {
243
+ "Node name for S&R": "EmptyLatentImage"
244
+ },
245
+ "widgets_values": [
246
+ 512,
247
+ 512,
248
+ 1
249
+ ]
250
+ },
251
+ {
252
+ "id": 11,
253
+ "type": "LoadImage",
254
+ "pos": [
255
+ -135,
256
+ 907
257
+ ],
258
+ "size": {
259
+ "0": 670,
260
+ "1": 460
261
+ },
262
+ "flags": {},
263
+ "order": 1,
264
+ "mode": 0,
265
+ "outputs": [
266
+ {
267
+ "name": "IMAGE",
268
+ "type": "IMAGE",
269
+ "links": [
270
+ 60
271
+ ],
272
+ "shape": 3,
273
+ "slot_index": 0
274
+ },
275
+ {
276
+ "name": "MASK",
277
+ "type": "MASK",
278
+ "links": [],
279
+ "shape": 3,
280
+ "slot_index": 1
281
+ }
282
+ ],
283
+ "properties": {
284
+ "Node name for S&R": "LoadImage"
285
+ },
286
+ "widgets_values": [
287
+ "example.png",
288
+ "image"
289
+ ]
290
+ },
291
+ {
292
+ "id": 40,
293
+ "type": "CheckpointLoaderSimple",
294
+ "pos": [
295
+ 1124,
296
+ 1019
297
+ ],
298
+ "size": {
299
+ "0": 315,
300
+ "1": 98
301
+ },
302
+ "flags": {},
303
+ "order": 2,
304
+ "mode": 0,
305
+ "outputs": [
306
+ {
307
+ "name": "MODEL",
308
+ "type": "MODEL",
309
+ "links": [
310
+ 42
311
+ ],
312
+ "shape": 3,
313
+ "slot_index": 0
314
+ },
315
+ {
316
+ "name": "CLIP",
317
+ "type": "CLIP",
318
+ "links": [
319
+ 50,
320
+ 51
321
+ ],
322
+ "shape": 3,
323
+ "slot_index": 1
324
+ },
325
+ {
326
+ "name": "VAE",
327
+ "type": "VAE",
328
+ "links": [
329
+ 49
330
+ ],
331
+ "shape": 3,
332
+ "slot_index": 2
333
+ }
334
+ ],
335
+ "properties": {
336
+ "Node name for S&R": "CheckpointLoaderSimple"
337
+ },
338
+ "widgets_values": [
339
+ "dreamshaper_8.safetensors"
340
+ ]
341
+ },
342
+ {
343
+ "id": 53,
344
+ "type": "img2txt BLIP/Llava Multimodel Tagger",
345
+ "pos": [
346
+ 584,
347
+ 865
348
+ ],
349
+ "size": [
350
+ 462.2727684830322,
351
+ 532.8236759410865
352
+ ],
353
+ "flags": {},
354
+ "order": 3,
355
+ "mode": 0,
356
+ "inputs": [
357
+ {
358
+ "name": "input_image",
359
+ "type": "IMAGE",
360
+ "link": 60
361
+ }
362
+ ],
363
+ "outputs": [
364
+ {
365
+ "name": "caption",
366
+ "type": "STRING",
367
+ "links": [
368
+ 61
369
+ ],
370
+ "shape": 3,
371
+ "slot_index": 0
372
+ }
373
+ ],
374
+ "properties": {
375
+ "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger"
376
+ },
377
+ "widgets_values": [
378
+ false,
379
+ false,
380
+ true,
381
+ false,
382
+ "a photograph of",
383
+ "What is a detailed description of this image?\nWhat is the background of this image?",
384
+ 0.8,
385
+ 1.2,
386
+ 36,
387
+ 128,
388
+ 5,
389
+ "watermark, text, writing",
390
+ "The image features a cartoon character standing against an abstract background consisting of green, blue, and white elements. The main focus is on the woman with bright yellow wings wearing pink attire while smiling at something off-frame in front of her that seems to be representing \"clouds\" or possibly another object within view but not clearly visible due to its distance from us as viewers., "
391
+ ],
392
+ "color": "#322",
393
+ "bgcolor": "#533"
394
+ }
395
+ ],
396
+ "links": [
397
+ [
398
+ 42,
399
+ 40,
400
+ 0,
401
+ 39,
402
+ 0,
403
+ "MODEL"
404
+ ],
405
+ [
406
+ 44,
407
+ 41,
408
+ 0,
409
+ 39,
410
+ 1,
411
+ "CONDITIONING"
412
+ ],
413
+ [
414
+ 45,
415
+ 42,
416
+ 0,
417
+ 39,
418
+ 2,
419
+ "CONDITIONING"
420
+ ],
421
+ [
422
+ 48,
423
+ 39,
424
+ 0,
425
+ 45,
426
+ 0,
427
+ "LATENT"
428
+ ],
429
+ [
430
+ 49,
431
+ 40,
432
+ 2,
433
+ 45,
434
+ 1,
435
+ "VAE"
436
+ ],
437
+ [
438
+ 50,
439
+ 40,
440
+ 1,
441
+ 41,
442
+ 0,
443
+ "CLIP"
444
+ ],
445
+ [
446
+ 51,
447
+ 40,
448
+ 1,
449
+ 42,
450
+ 0,
451
+ "CLIP"
452
+ ],
453
+ [
454
+ 55,
455
+ 45,
456
+ 0,
457
+ 48,
458
+ 0,
459
+ "IMAGE"
460
+ ],
461
+ [
462
+ 59,
463
+ 52,
464
+ 0,
465
+ 39,
466
+ 3,
467
+ "LATENT"
468
+ ],
469
+ [
470
+ 60,
471
+ 11,
472
+ 0,
473
+ 53,
474
+ 0,
475
+ "IMAGE"
476
+ ],
477
+ [
478
+ 61,
479
+ 53,
480
+ 0,
481
+ 41,
482
+ 1,
483
+ "STRING"
484
+ ]
485
+ ],
486
+ "groups": [],
487
+ "config": {},
488
+ "extra": {
489
+ "ds": {
490
+ "scale": 0.9090909090909091,
491
+ "offset": {
492
+ "0": 278.52736579431155,
493
+ "1": -323.6237095104226
494
+ }
495
+ }
496
+ },
497
+ "version": 0.4
498
+ }