Freak-ppa committed (verified)
Commit b298049 · 1 Parent(s): 311187c

Upload 18 files

.gitattributes CHANGED
@@ -62,3 +62,6 @@ ComfyUI/temp/ComfyUI_temp_lhrdf_00001_.png filter=lfs diff=lfs merge=lfs -text
  ComfyUI/temp/ComfyUI_temp_lhrdf_00002_.png filter=lfs diff=lfs merge=lfs -text
  ComfyUI/temp/ComfyUI_temp_pxrdj_00001_.png filter=lfs diff=lfs merge=lfs -text
  ComfyUI/temp/ComfyUI_temp_pxrdj_00002_.png filter=lfs diff=lfs merge=lfs -text
+ ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_001.png filter=lfs diff=lfs merge=lfs -text
+ ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_002.png filter=lfs diff=lfs merge=lfs -text
+ ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_003.png filter=lfs diff=lfs merge=lfs -text
ComfyUI/custom_nodes/img2txt-comfyui-nodes/__init__.py ADDED
@@ -0,0 +1,9 @@
from .src.img2txt_node import Img2TxtNode

NODE_CLASS_MAPPINGS = {
    "img2txt BLIP/Llava Multimodel Tagger": Img2TxtNode,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "img2txt BLIP/Llava Multimodel Tagger": "Image to Text - Auto Caption"
}
WEB_DIRECTORY = "./web"
ComfyUI/custom_nodes/img2txt-comfyui-nodes/pyproject.toml ADDED
@@ -0,0 +1,15 @@
[project]
name = "img2txt-comfyui-nodes"
description = "Get general description or specify questions to ask about images (medium, art style, background, etc.). Supports Chinese 🇨🇳 questions via MiniCPM model."
version = "1.1.4"
license = "LICENSE"
dependencies = ["transformers>=4.36.0", "bitsandbytes>=0.43.0", "timm>=1.0.7", "sentencepiece==0.1.99", "accelerate>=0.3.0", "deepspeed"]

[project.urls]
Repository = "https://github.com/christian-byrne/img2txt-comfyui-nodes"
# Used by Comfy Registry https://comfyregistry.org

[tool.comfy]
PublisherId = "christian-byrne"
DisplayName = "Img2txt - Auto Caption"
Icon = "https://img.icons8.com/?size=100&id=49374&format=png&color=000000"
ComfyUI/custom_nodes/img2txt-comfyui-nodes/requirements.txt ADDED
@@ -0,0 +1,6 @@
transformers>=4.36.0
bitsandbytes>=0.43.0
timm>=1.0.7
sentencepiece
accelerate>=0.3.0
deepspeed
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/__init__.py ADDED
File without changes
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/blip_img2txt.py ADDED
@@ -0,0 +1,92 @@
import os
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    BlipConfig,
    BlipTextConfig,
    BlipVisionConfig,
)

import torch
import model_management
import folder_paths


class BLIPImg2Txt:
    def __init__(
        self,
        conditional_caption: str,
        min_words: int,
        max_words: int,
        temperature: float,
        repetition_penalty: float,
        search_beams: int,
        model_id: str = "Salesforce/blip-image-captioning-large",
        custom_model_path: str = None,
    ):
        self.conditional_caption = conditional_caption
        self.model_id = model_id
        self.custom_model_path = custom_model_path

        if self.custom_model_path and os.path.exists(self.custom_model_path):
            self.model_path = self.custom_model_path
        else:
            self.model_path = folder_paths.get_full_path("blip", model_id)

        if temperature > 1.1 or temperature < 0.90:
            do_sample = True
            num_beams = 1
        else:
            do_sample = False
            num_beams = search_beams if search_beams > 1 else 1

        self.text_config_kwargs = {
            "do_sample": do_sample,
            "max_length": max_words,
            "min_length": min_words,
            "repetition_penalty": repetition_penalty,
            "padding": "max_length",
        }
        if not do_sample:
            self.text_config_kwargs["temperature"] = temperature
            self.text_config_kwargs["num_beams"] = num_beams

    def generate_caption(self, image: Image.Image) -> str:
        if image.mode != "RGB":
            image = image.convert("RGB")

        if self.model_path and os.path.exists(self.model_path):
            model_path = self.model_path
            local_files_only = True
        else:
            model_path = self.model_id
            local_files_only = False

        processor = BlipProcessor.from_pretrained(model_path, local_files_only=local_files_only)

        config_text = BlipTextConfig.from_pretrained(model_path, local_files_only=local_files_only)
        config_text.update(self.text_config_kwargs)
        config_vision = BlipVisionConfig.from_pretrained(model_path, local_files_only=local_files_only)
        config = BlipConfig.from_text_vision_configs(config_text, config_vision)

        model = BlipForConditionalGeneration.from_pretrained(
            model_path,
            config=config,
            torch_dtype=torch.float16,
            local_files_only=local_files_only,
        ).to(model_management.get_torch_device())

        inputs = processor(
            image,
            self.conditional_caption,
            return_tensors="pt",
        ).to(model_management.get_torch_device(), torch.float16)

        with torch.no_grad():
            out = model.generate(**inputs)
            ret = processor.decode(out[0], skip_special_tokens=True)

        del model
        torch.cuda.empty_cache()

        return ret
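For reference, a minimal usage sketch of the class above. It assumes the node's src/ files are importable as plain modules inside a ComfyUI environment (where `model_management` and `folder_paths` exist), and "photo.png" is a placeholder path, not a file in this commit.

# Usage sketch for BLIPImg2Txt (runs inside ComfyUI; "photo.png" is hypothetical).
from PIL import Image
from blip_img2txt import BLIPImg2Txt

captioner = BLIPImg2Txt(
    conditional_caption="a photograph of",  # prefix that conditions the caption
    min_words=36,
    max_words=128,
    temperature=1.0,       # inside the 0.90-1.1 band, so beam search is used
    repetition_penalty=1.2,
    search_beams=5,        # number of beams when beam search is active
)
print(captioner.generate_caption(Image.open("photo.png")))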
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/description_classifier.py ADDED
@@ -0,0 +1,8 @@
#!pip install transformers[sentencepiece]
# from transformers import pipeline
# text = "Angela Merkel is a politician in Germany and leader of the CDU"
# hypothesis_template = "This text is about {}"
# classes_verbalized = ["politics", "economy", "entertainment", "environment"]
# zeroshot_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0")  # change the model identifier here
# output = zeroshot_classifier(text, classes_verbalized, hypothesis_template=hypothesis_template, multi_label=False)
# print(output)
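The file above is kept entirely commented out. For reference, a runnable version of the same zero-shot classification call; it downloads the MoritzLaurer/deberta-v3-large-zeroshot-v2.0 checkpoint on first use.

# Uncommented sketch of the zero-shot classification snippet above.
from transformers import pipeline

text = "Angela Merkel is a politician in Germany and leader of the CDU"
hypothesis_template = "This text is about {}"
classes_verbalized = ["politics", "economy", "entertainment", "environment"]

zeroshot_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
)
output = zeroshot_classifier(
    text,
    classes_verbalized,
    hypothesis_template=hypothesis_template,
    multi_label=False,
)
print(output["labels"][0], output["scores"][0])  # top class and its score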
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/img2txt_node.py ADDED
@@ -0,0 +1,217 @@
"""
@author: christian-byrne
@title: Img2Txt auto captioning. Choose from models: BLIP, Llava, MiniCPM, MS-GIT. Use model combos and merge results. Specify questions to ask about images (medium, art style, background). Supports Chinese 🇨🇳 questions via MiniCPM.
@nickname: Image to Text - Auto Caption
"""

import torch
from torchvision import transforms

from .img_tensor_utils import TensorImgUtils
from .llava_img2txt import LlavaImg2Txt
from .blip_img2txt import BLIPImg2Txt
from .mini_cpm_img2txt import MiniPCMImg2Txt

from typing import Tuple

import os
import folder_paths


class Img2TxtNode:
    CATEGORY = "img2txt"

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "input_image": ("IMAGE",),
            },
            "optional": {
                "use_blip_model": (
                    "BOOLEAN",
                    {
                        "default": True,
                        "label_on": "Use BLIP (Requires 2Gb Disk)",
                        "label_off": "Don't use BLIP",
                    },
                ),
                "use_llava_model": (
                    "BOOLEAN",
                    {
                        "default": False,
                        "label_on": "Use Llava (Requires 15Gb Disk)",
                        "label_off": "Don't use Llava",
                    },
                ),
                "use_mini_pcm_model": (
                    "BOOLEAN",
                    {
                        "default": False,
                        "label_on": "Use MiniCPM (Requires 6Gb Disk)",
                        "label_off": "Don't use MiniCPM",
                    },
                ),
                "use_all_models": (
                    "BOOLEAN",
                    {
                        "default": False,
                        "label_on": "Use all models and combine outputs (Total Size: 20+Gb)",
                        "label_off": "Use selected models only",
                    },
                ),
                "blip_caption_prefix": (
                    "STRING",
                    {
                        "default": "a photograph of",
                    },
                ),
                "prompt_questions": (
                    "STRING",
                    {
                        "default": "What is the subject of this image?\nWhat are the mediums used to make this?\nWhat are the artistic styles this is reminiscent of?\nWhich famous artists is this reminiscent of?\nHow sharp or detailed is this image?\nWhat is the environment and background of this image?\nWhat are the objects in this image?\nWhat is the composition of this image?\nWhat is the color palette in this image?\nWhat is the lighting in this image?",
                        "multiline": True,
                    },
                ),
                "temperature": (
                    "FLOAT",
                    {
                        "default": 0.8,
                        "min": 0.1,
                        "max": 2.0,
                        "step": 0.01,
                        "display": "slider",
                    },
                ),
                "repetition_penalty": (
                    "FLOAT",
                    {
                        "default": 1.2,
                        "min": 0.1,
                        "max": 2.0,
                        "step": 0.01,
                        "display": "slider",
                    },
                ),
                "min_words": ("INT", {"default": 36}),
                "max_words": ("INT", {"default": 128}),
                "search_beams": ("INT", {"default": 5}),
                "exclude_terms": (
                    "STRING",
                    {
                        "default": "watermark, text, writing",
                    },
                ),
            },
            "hidden": {
                "unique_id": "UNIQUE_ID",
                "extra_pnginfo": "EXTRA_PNGINFO",
                "output_text": (
                    "STRING",
                    {
                        "default": "",
                    },
                ),
            },
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("caption",)
    FUNCTION = "main"
    OUTPUT_NODE = True

    def main(
        self,
        input_image: torch.Tensor,  # [Batch_n, H, W, 3-channel]
        use_blip_model: bool,
        use_llava_model: bool,
        use_all_models: bool,
        use_mini_pcm_model: bool,
        blip_caption_prefix: str,
        prompt_questions: str,
        temperature: float,
        repetition_penalty: float,
        min_words: int,
        max_words: int,
        search_beams: int,
        exclude_terms: str,
        output_text: str = "",
        unique_id=None,
        extra_pnginfo=None,
    ) -> Tuple[str, ...]:
        raw_image = transforms.ToPILImage()(
            TensorImgUtils.convert_to_type(input_image, "CHW")
        ).convert("RGB")

        if blip_caption_prefix == "":
            blip_caption_prefix = "a photograph of"

        captions = []
        if use_all_models or use_blip_model:
            blip_model_path = folder_paths.get_folder_paths("blip")[0]
            print(f"blip_model_path: {blip_model_path}")
            if not blip_model_path or not os.path.exists(blip_model_path):
                raise ValueError("BLIP model 'blip-image-captioning-large' not found in ComfyUI models directory. Please ensure it's in the 'models/blip' folder.")

            blip = BLIPImg2Txt(
                conditional_caption=blip_caption_prefix,
                min_words=min_words,
                max_words=max_words,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                search_beams=search_beams,
                custom_model_path=blip_model_path,
            )
            captions.append(blip.generate_caption(raw_image))

        if use_all_models or use_llava_model:
            llava_questions = prompt_questions.split("\n")
            llava_questions = [
                q
                for q in llava_questions
                if q != "" and q != " " and q != "\n" and q != "\n\n"
            ]
            if len(llava_questions) > 0:
                llava = LlavaImg2Txt(
                    question_list=llava_questions,
                    model_id="llava-hf/llava-1.5-7b-hf",
                    use_4bit_quantization=True,
                    use_low_cpu_mem=True,
                    use_flash2_attention=False,
                    max_tokens_per_chunk=300,
                )
                captions.append(llava.generate_caption(raw_image))

        if use_all_models or use_mini_pcm_model:
            mini_pcm = MiniPCMImg2Txt(
                question_list=prompt_questions.split("\n"),
                temperature=temperature,
            )
            captions.append(mini_pcm.generate_captions(raw_image))

        out_string = self.exclude(exclude_terms, self.merge_captions(captions))

        return {"ui": {"text": out_string}, "result": (out_string,)}

    def merge_captions(self, captions: list) -> str:
        """Merge captions from multiple models into one string.
        Necessary because we can expect the generated captions will generally
        be comma-separated fragments ordered by relevance - so combine
        fragments in an alternating order."""
        merged_caption = ""
        captions = [c.split(",") for c in captions]
        for i in range(max(len(c) for c in captions)):
            for j in range(len(captions)):
                if i < len(captions[j]) and captions[j][i].strip() != "":
                    merged_caption += captions[j][i].strip() + ", "
        return merged_caption

    def exclude(self, exclude_terms: str, out_string: str) -> str:
        # https://huggingface.co/Salesforce/blip-image-captioning-large/discussions/20
        exclude_terms = "arafed," + exclude_terms
        exclude_terms = [
            term.strip().lower() for term in exclude_terms.split(",") if term != ""
        ]
        for term in exclude_terms:
            out_string = out_string.replace(term, "")

        return out_string
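For reference, a standalone copy of the merge logic above, with made-up caption strings, to show how fragments from different models are interleaved in alternating order.

# Standalone sketch of Img2TxtNode.merge_captions (example captions are invented).
def merge_captions(captions):
    merged_caption = ""
    captions = [c.split(",") for c in captions]
    for i in range(max(len(c) for c in captions)):
        for j in range(len(captions)):
            if i < len(captions[j]) and captions[j][i].strip() != "":
                merged_caption += captions[j][i].strip() + ", "
    return merged_caption

blip_caption = "a photograph of a girl, pink dress, bright blue eyes"
llava_caption = "soft lighting, impressionist style, grassy field"
print(merge_captions([blip_caption, llava_caption]))
# -> "a photograph of a girl, soft lighting, pink dress, impressionist style, bright blue eyes, grassy field, "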
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/img_tensor_utils.py ADDED
@@ -0,0 +1,129 @@
import torch
from typing import Tuple


class TensorImgUtils:
    @staticmethod
    def from_to(from_type: list[str], to_type: list[str]):
        """Return a function that converts a tensor from one type to another. Args can be lists of strings or just strings (e.g., ["C", "H", "W"] or just "CHW")."""
        if isinstance(from_type, list):
            from_type = "".join(from_type)
        if isinstance(to_type, list):
            to_type = "".join(to_type)

        permute_arg = [from_type.index(c) for c in to_type]

        def convert(tensor: torch.Tensor) -> torch.Tensor:
            return tensor.permute(permute_arg)

        return convert

    @staticmethod
    def convert_to_type(tensor: torch.Tensor, to_type: str) -> torch.Tensor:
        """Convert a tensor to a specific type."""
        from_type = TensorImgUtils.identify_type(tensor)[0]
        if from_type == list(to_type):
            return tensor

        if len(from_type) == 4 and len(to_type) == 3:
            # If converting from a batched tensor to a non-batched tensor, squeeze the batch dimension
            tensor = tensor.squeeze(0)
            from_type = from_type[1:]
        if len(from_type) == 3 and len(to_type) == 4:
            # If converting from a non-batched tensor to a batched tensor, unsqueeze the batch dimension
            tensor = tensor.unsqueeze(0)
            from_type = ["B"] + from_type

        return TensorImgUtils.from_to(from_type, list(to_type))(tensor)

    @staticmethod
    def identify_type(tensor: torch.Tensor) -> Tuple[list[str], str]:
        """Identify the type of image tensor. Doesn't currently check for BHW. Returns one of the following:"""
        dim_n = tensor.dim()
        if dim_n == 2:
            return (["H", "W"], "HW")
        elif dim_n == 3:  # HWA, AHW, HWC, or CHW
            if tensor.size(2) == 3:
                return (["H", "W", "C"], "HWRGB")
            elif tensor.size(2) == 4:
                return (["H", "W", "C"], "HWRGBA")
            elif tensor.size(0) == 3:
                return (["C", "H", "W"], "RGBHW")
            elif tensor.size(0) == 4:
                return (["C", "H", "W"], "RGBAHW")
            elif tensor.size(2) == 1:
                return (["H", "W", "C"], "HWA")
            elif tensor.size(0) == 1:
                return (["C", "H", "W"], "AHW")
        elif dim_n == 4:  # BHWC or BCHW
            if tensor.size(3) >= 3:  # BHWRGB or BHWRGBA
                if tensor.size(3) == 3:
                    return (["B", "H", "W", "C"], "BHWRGB")
                elif tensor.size(3) == 4:
                    return (["B", "H", "W", "C"], "BHWRGBA")
            elif tensor.size(1) >= 3:
                if tensor.size(1) == 3:
                    return (["B", "C", "H", "W"], "BRGBHW")
                elif tensor.size(1) == 4:
                    return (["B", "C", "H", "W"], "BRGBAHW")
        else:
            raise ValueError(
                f"{dim_n} dimensions is not a valid number of dimensions for an image tensor."
            )

        raise ValueError(
            f"Could not determine shape of Tensor with {dim_n} dimensions and {tensor.shape} shape."
        )

    @staticmethod
    def test_squeeze_batch(tensor: torch.Tensor, strict=False) -> torch.Tensor:
        # Check if the tensor has a batch dimension (size 4)
        if tensor.dim() == 4:
            if tensor.size(0) == 1 or not strict:
                # If it has a batch dimension with size 1, remove it. It represents a single image.
                return tensor.squeeze(0)
            else:
                raise ValueError(
                    f"This is not a single image. It's a batch of {tensor.size(0)} images."
                )
        else:
            # Otherwise, it doesn't have a batch dimension, so just return the tensor as is.
            return tensor

    @staticmethod
    def test_unsqueeze_batch(tensor: torch.Tensor) -> torch.Tensor:
        # Check if the tensor has a batch dimension (size 4)
        if tensor.dim() == 3:
            # If it doesn't have a batch dimension, add one. It represents a single image.
            return tensor.unsqueeze(0)
        else:
            # Otherwise, it already has a batch dimension, so just return the tensor as is.
            return tensor

    @staticmethod
    def most_pixels(img_tensors: list[torch.Tensor]) -> torch.Tensor:
        sizes = [
            TensorImgUtils.height_width(img)[0] * TensorImgUtils.height_width(img)[1]
            for img in img_tensors
        ]
        return img_tensors[sizes.index(max(sizes))]

    @staticmethod
    def height_width(image: torch.Tensor) -> Tuple[int, int]:
        """Like torchvision.transforms methods, this method assumes Tensor to
        have [..., H, W] shape, where ... means an arbitrary number of leading
        dimensions
        """
        return image.shape[-2:]

    @staticmethod
    def smaller_axis(image: torch.Tensor) -> int:
        h, w = TensorImgUtils.height_width(image)
        return 2 if h < w else 3

    @staticmethod
    def larger_axis(image: torch.Tensor) -> int:
        h, w = TensorImgUtils.height_width(image)
        return 2 if h > w else 3
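For reference, a small sketch of the conversion the node relies on: a ComfyUI-style (B, H, W, C) image batch permuted to (C, H, W) before being handed to torchvision's ToPILImage. It only needs torch and assumes img_tensor_utils.py is importable on its own; the tensor here is random placeholder data.

# Sketch: BHWC -> CHW conversion with TensorImgUtils (random data, hypothetical import path).
import torch
from img_tensor_utils import TensorImgUtils

batch = torch.rand(1, 512, 768, 3)                  # like a LoadImage output: B, H, W, RGB
print(TensorImgUtils.identify_type(batch))          # (['B', 'H', 'W', 'C'], 'BHWRGB')

chw = TensorImgUtils.convert_to_type(batch, "CHW")  # squeezes the batch dim, then permutes
print(chw.shape)                                    # torch.Size([3, 512, 768])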
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/keyword_extract.py ADDED
@@ -0,0 +1,114 @@
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import nltk


def nltk_speach_tag(sentence):
    nltk.download("punkt")
    nltk.download("averaged_perceptron_tagger")
    nltk.download("stopwords")

    # Tokenize the sentence
    tokens = word_tokenize(sentence)

    # Filter out stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [
        word for word in tokens if word.lower() not in stop_words and word.isalnum()
    ]

    # Perform Part-of-Speech tagging
    tagged_tokens = pos_tag(filtered_tokens)

    # Extract nouns and proper nouns
    salient_tokens = [
        token
        for token, pos in tagged_tokens
        if pos in ["NN", "NNP", "NNS", "NNPS", "ADJ", "JJ", "FW"]
    ]
    salient_tokens = list(set(salient_tokens))

    # Re-add commas or periods relative to the original sentence

    comma_period_indices = [i for i, char in enumerate(sentence) if char in [",", "."]]
    salient_tokens_indices = [sentence.index(token) for token in salient_tokens]

    # Add commas or periods between words if there was one in the original sentence
    out = ""
    for i, index in enumerate(salient_tokens_indices):
        out += salient_tokens[i]
        distance_between_next = (
            salient_tokens_indices[i + 1] - index
            if i + 1 < len(salient_tokens_indices)
            else None
        )

        puncuated = False
        if not distance_between_next:
            puncuated = True
        else:
            for i in range(index, index + distance_between_next):
                if i in comma_period_indices:
                    puncuated = True
                    break

        if not puncuated:
            # IF the previous word was an adjective, and current is a noun, add a space
            if (
                i > 0
                and tagged_tokens[i - 1][1] in ["JJ", "ADJ"]
                and tagged_tokens[i][1] in ["NN", "NNP", "NNS", "NNPS"]
            ):
                out += " "
            else:
                out += ", "
        else:
            out += ". "

    # Add the last token
    out += sentence[-1]

    # Print the salient tokens
    return out.strip().strip(",").strip(".").strip()


def extract_keywords(text: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-keyword-extractor")
    model = AutoModelForTokenClassification.from_pretrained(
        "yanekyuk/bert-keyword-extractor"
    )
    """Return keywords from text using a BERT model trained for keyword extraction as
    a comma-separated string."""
    print(f"Extracting keywords from text: {text}")

    for char in ["\n", "\t", "\r"]:
        text = text.replace(char, " ")

    sentences = text.split(".")
    result = ""

    for sentence in sentences:
        print(f"Extracting keywords from sentence: {sentence}")
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_token_class_ids = logits.argmax(dim=-1)

        predicted_keywords = []
        for token_id, token in zip(
            predicted_token_class_ids[0],
            tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]),
        ):
            if token_id == 1:
                predicted_keywords.append(token)

        print(f"Extracted keywords: {predicted_keywords}")
        result += ", ".join(predicted_keywords) + ", "

    print(f"All Keywords: {result}")
    return result
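For reference, a hedged usage sketch of extract_keywords above. It assumes nltk and transformers are installed (nltk is imported by this module but is not listed in requirements.txt), downloads the yanekyuk/bert-keyword-extractor checkpoint on first use, and the caption string is invented.

# Sketch: pulling keywords out of a generated caption (hypothetical input text).
from keyword_extract import extract_keywords

caption = (
    "A photograph of a girl in a pink dress standing in a grassy field. "
    "Soft impressionist lighting."
)
print(extract_keywords(caption))  # e.g. "girl, pink, dress, field, ..."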
ComfyUI/custom_nodes/img2txt-comfyui-nodes/src/llava_img2txt.py ADDED
@@ -0,0 +1,131 @@
from PIL import Image
import torch
import model_management
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig


class LlavaImg2Txt:
    """
    A class to generate text captions for images using the Llava model.

    Args:
        question_list (list[str]): A list of questions to ask the model about the image.
        model_id (str): The model's name in the Hugging Face model hub.
        use_4bit_quantization (bool): Whether to use 4-bit quantization to reduce memory usage. 4-bit quantization reduces the precision of model parameters, potentially affecting the quality of generated outputs. Use if VRAM is limited. Default is True.
        use_low_cpu_mem (bool): In low_cpu_mem_usage mode, the model is initialized with optimizations aimed at reducing CPU memory consumption. This can be beneficial when working with large models or limited computational resources. Default is True.
        use_flash2_attention (bool): Whether to use Flash-Attention 2. Flash-Attention 2 focuses on optimizing attention mechanisms, which are crucial for the model's performance during generation. Use if computational resources are abundant. Default is False.
        max_tokens_per_chunk (int): The maximum number of tokens to generate per prompt chunk. Default is 300.
    """

    def __init__(
        self,
        question_list,
        model_id: str = "llava-hf/llava-1.5-7b-hf",
        use_4bit_quantization: bool = True,
        use_low_cpu_mem: bool = True,
        use_flash2_attention: bool = False,
        max_tokens_per_chunk: int = 300,
    ):
        self.question_list = question_list
        self.model_id = model_id
        self.use_4bit = use_4bit_quantization
        self.use_flash2 = use_flash2_attention
        self.use_low_cpu_mem = use_low_cpu_mem
        self.max_tokens_per_chunk = max_tokens_per_chunk

    def generate_caption(
        self,
        raw_image: Image.Image,
    ) -> str:
        """
        Generate a caption for an image using the Llava model.

        Args:
            raw_image (Image): Image to generate caption for
        """
        # Convert Image to RGB first
        if raw_image.mode != "RGB":
            raw_image = raw_image.convert("RGB")

        dtype = torch.float16
        quant_config = BitsAndBytesConfig(
            load_in_4bit=self.use_4bit,
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_quant_type="fp4",
        )

        model = LlavaForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=dtype,
            low_cpu_mem_usage=self.use_low_cpu_mem,
            use_flash_attention_2=self.use_flash2,
            quantization_config=quant_config,
        )

        # model.to() is not supported for 4-bit or 8-bit bitsandbytes models. With 4-bit quantization, use the model as it is, since the model will already be set to the correct devices and casted to the correct `dtype`.
        if torch.cuda.is_available() and not self.use_4bit:
            model = model.to(model_management.get_torch_device(), torch.float16)

        processor = AutoProcessor.from_pretrained(self.model_id)
        prompt_chunks = self.__get_prompt_chunks(chunk_size=4)

        caption = ""
        with torch.no_grad():
            for prompt_list in prompt_chunks:
                prompt = self.__get_single_answer_prompt(prompt_list)
                inputs = processor(prompt, raw_image, return_tensors="pt").to(
                    model_management.get_torch_device(), torch.float16
                )
                output = model.generate(
                    **inputs, max_new_tokens=self.max_tokens_per_chunk, do_sample=False
                )
                decoded = processor.decode(output[0][2:])
                cleaned = self.clean_output(decoded)
                caption += cleaned

        del model
        torch.cuda.empty_cache()

        return caption

    def clean_output(self, decoded_output, delimiter=","):
        output_only = decoded_output.split("ASSISTANT: ")[1]
        lines = output_only.split("\n")
        cleaned_output = ""
        for line in lines:
            cleaned_output += self.__replace_delimiter(line, ".", delimiter)

        return cleaned_output

    def __get_single_answer_prompt(self, questions):
        """
        For multiple turns conversation:
        "USER: <image>\n<prompt1> ASSISTANT: <answer1></s>USER: <prompt2> ASSISTANT: <answer2></s>USER: <prompt3> ASSISTANT:"
        From: https://huggingface.co/docs/transformers/en/model_doc/llava#usage-tips
        Not sure how the formatting works for multi-turn but those are the docs.
        """
        prompt = "USER: <image>\n"
        for index, question in enumerate(questions):
            if index != 0:
                prompt += "USER: "
            prompt += f"{question} </s >"
        prompt += "ASSISTANT: "

        return prompt

    def __replace_delimiter(self, text: str, old, new=","):
        """Replace only the LAST instance of old with new"""
        if old not in text:
            return text.strip() + " "
        last_old_index = text.rindex(old)
        replaced = text[:last_old_index] + new + text[last_old_index + len(old):]
        return replaced.strip() + " "

    def __get_prompt_chunks(self, chunk_size=4):
        prompt_chunks = []
        for index, feature in enumerate(self.question_list):
            if index % chunk_size == 0:
                prompt_chunks.append([feature])
            else:
                prompt_chunks[-1].append(feature)
        return prompt_chunks
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from PIL import Image
3
+ from transformers import AutoModel, AutoTokenizer
4
+
5
+ import model_management
6
+
7
+ class MiniPCMImg2Txt:
8
+ def __init__(self, question_list: list[str], temperature: float = 0.7):
9
+ self.model_id = "openbmb/MiniCPM-V-2"
10
+ self.question_list = question_list
11
+ self.question_list = self.__create_question_list()
12
+ self.temperature = temperature
13
+
14
+ def __create_question_list(self) -> list:
15
+ ret = []
16
+ for q in self.question_list:
17
+ ret.append({"role": "user", "content": q})
18
+ return ret
19
+
20
+ def generate_captions(self, raw_image: Image.Image) -> str:
21
+ device = model_management.get_torch_device()
22
+
23
+ # For Nvidia GPUs support BF16 (like A100, H100, RTX3090)
24
+ # For Nvidia GPUs do NOT support BF16 (like V100, T4, RTX2080)
25
+ torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
26
+
27
+ model = AutoModel.from_pretrained(
28
+ "openbmb/MiniCPM-V-2", trust_remote_code=True, torch_dtype=torch_dtype
29
+ )
30
+ model = model.to(device=device, dtype=torch_dtype)
31
+
32
+ tokenizer = AutoTokenizer.from_pretrained(
33
+ self.model_id, trust_remote_code=True
34
+ )
35
+ model.eval()
36
+
37
+ if raw_image.mode != "RGB":
38
+ raw_image = raw_image.convert("RGB")
39
+
40
+ with torch.no_grad():
41
+ res, _, _ = model.chat(
42
+ image=raw_image,
43
+ msgs=self.question_list,
44
+ context=None,
45
+ tokenizer=tokenizer,
46
+ sampling=True,
47
+ temperature=self.temperature,
48
+ )
49
+
50
+ del model
51
+ torch.cuda.empty_cache()
52
+
53
+ return res
ComfyUI/custom_nodes/img2txt-comfyui-nodes/web/show-output-text.js ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { app } from "../../../scripts/app.js";
2
+ import { ComfyWidgets } from "../../../scripts/widgets.js";
3
+
4
+ // Displays output caption text
5
+ app.registerExtension({
6
+ name: "Img2TxtNode",
7
+ async beforeRegisterNodeDef(nodeType, nodeData, app) {
8
+ if (nodeData.name === "img2txt BLIP/Llava Multimodel Tagger") {
9
+ function populate(message) {
10
+ console.log("message", message);
11
+ console.log("message.text", message.text);
12
+
13
+ const insertIndex = this.widgets.findIndex((w) => w.name === "output_text");
14
+ if (insertIndex !== -1) {
15
+ for (let i = insertIndex; i < this.widgets.length; i++) {
16
+ this.widgets[i].onRemove?.();
17
+ }
18
+ this.widgets.length = insertIndex;
19
+ }
20
+
21
+ const outputWidget = ComfyWidgets["STRING"](
22
+ this,
23
+ "output_text",
24
+ ["STRING", { multiline: true }],
25
+ app
26
+ ).widget;
27
+ outputWidget.inputEl.readOnly = true;
28
+ outputWidget.inputEl.style.opacity = 0.6;
29
+ outputWidget.value = message.text.join("");
30
+
31
+ requestAnimationFrame(() => {
32
+ const size_ = this.computeSize();
33
+ if (size_[0] < this.size[0]) {
34
+ size_[0] = this.size[0];
35
+ }
36
+ if (size_[1] < this.size[1]) {
37
+ size_[1] = this.size[1];
38
+ }
39
+ this.onResize?.(size_);
40
+ app.graph.setDirtyCanvas(true, false);
41
+ });
42
+ }
43
+
44
+ const onExecuted = nodeType.prototype.onExecuted;
45
+ nodeType.prototype.onExecuted = function (message) {
46
+ onExecuted?.apply(this, arguments);
47
+ populate.call(this, message);
48
+ };
49
+ }
50
+ },
51
+ });
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_001.png ADDED

Git LFS Details

  • SHA256: c71dc3dab484d9362680510fbbfe725e0cd988e0575b79acf339e7296faedb3a
  • Pointer size: 133 Bytes
  • Size of remote file: 13.7 MB
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_002.png ADDED

Git LFS Details

  • SHA256: c2db46defa1b80a63256d4f0d85dc010e6950ae30f56b15c86bd1871469d4783
  • Pointer size: 133 Bytes
  • Size of remote file: 13.8 MB
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/demo-pics/Selection_003.png ADDED

Git LFS Details

  • SHA256: 006a87fa5d86a9addc5953a3c9f6fd20b9bbf06efe328a9412bd5277bfd4aeb5
  • Pointer size: 132 Bytes
  • Size of remote file: 9.54 MB
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/workflow-examples/img2img.json ADDED
@@ -0,0 +1,523 @@
1
+ {
2
+ "last_node_id": 51,
3
+ "last_link_id": 60,
4
+ "nodes": [
5
+ {
6
+ "id": 41,
7
+ "type": "CLIPTextEncode",
8
+ "pos": [
9
+ 1055,
10
+ 571
11
+ ],
12
+ "size": {
13
+ "0": 348.9403381347656,
14
+ "1": 56.439388275146484
15
+ },
16
+ "flags": {},
17
+ "order": 5,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "clip",
22
+ "type": "CLIP",
23
+ "link": 50
24
+ },
25
+ {
26
+ "name": "text",
27
+ "type": "STRING",
28
+ "link": 60,
29
+ "widget": {
30
+ "name": "text"
31
+ }
32
+ }
33
+ ],
34
+ "outputs": [
35
+ {
36
+ "name": "CONDITIONING",
37
+ "type": "CONDITIONING",
38
+ "links": [
39
+ 44
40
+ ],
41
+ "shape": 3,
42
+ "slot_index": 0
43
+ }
44
+ ],
45
+ "properties": {
46
+ "Node name for S&R": "CLIPTextEncode"
47
+ },
48
+ "widgets_values": [
49
+ ""
50
+ ]
51
+ },
52
+ {
53
+ "id": 39,
54
+ "type": "KSampler",
55
+ "pos": [
56
+ 1587,
57
+ 982
58
+ ],
59
+ "size": {
60
+ "0": 315,
61
+ "1": 262
62
+ },
63
+ "flags": {},
64
+ "order": 6,
65
+ "mode": 0,
66
+ "inputs": [
67
+ {
68
+ "name": "model",
69
+ "type": "MODEL",
70
+ "link": 42
71
+ },
72
+ {
73
+ "name": "positive",
74
+ "type": "CONDITIONING",
75
+ "link": 44
76
+ },
77
+ {
78
+ "name": "negative",
79
+ "type": "CONDITIONING",
80
+ "link": 45
81
+ },
82
+ {
83
+ "name": "latent_image",
84
+ "type": "LATENT",
85
+ "link": 58
86
+ }
87
+ ],
88
+ "outputs": [
89
+ {
90
+ "name": "LATENT",
91
+ "type": "LATENT",
92
+ "links": [
93
+ 48
94
+ ],
95
+ "shape": 3,
96
+ "slot_index": 0
97
+ }
98
+ ],
99
+ "properties": {
100
+ "Node name for S&R": "KSampler"
101
+ },
102
+ "widgets_values": [
103
+ 290872458059323,
104
+ "randomize",
105
+ 20,
106
+ 8,
107
+ "euler",
108
+ "normal",
109
+ 1
110
+ ]
111
+ },
112
+ {
113
+ "id": 45,
114
+ "type": "VAEDecode",
115
+ "pos": [
116
+ 1998,
117
+ 1018
118
+ ],
119
+ "size": {
120
+ "0": 210,
121
+ "1": 46
122
+ },
123
+ "flags": {},
124
+ "order": 7,
125
+ "mode": 0,
126
+ "inputs": [
127
+ {
128
+ "name": "samples",
129
+ "type": "LATENT",
130
+ "link": 48
131
+ },
132
+ {
133
+ "name": "vae",
134
+ "type": "VAE",
135
+ "link": 49
136
+ }
137
+ ],
138
+ "outputs": [
139
+ {
140
+ "name": "IMAGE",
141
+ "type": "IMAGE",
142
+ "links": [
143
+ 55
144
+ ],
145
+ "shape": 3,
146
+ "slot_index": 0
147
+ }
148
+ ],
149
+ "properties": {
150
+ "Node name for S&R": "VAEDecode"
151
+ }
152
+ },
153
+ {
154
+ "id": 48,
155
+ "type": "PreviewImage",
156
+ "pos": [
157
+ 2039,
158
+ 1262
159
+ ],
160
+ "size": {
161
+ "0": 210,
162
+ "1": 246
163
+ },
164
+ "flags": {},
165
+ "order": 8,
166
+ "mode": 0,
167
+ "inputs": [
168
+ {
169
+ "name": "images",
170
+ "type": "IMAGE",
171
+ "link": 55
172
+ }
173
+ ],
174
+ "properties": {
175
+ "Node name for S&R": "PreviewImage"
176
+ }
177
+ },
178
+ {
179
+ "id": 42,
180
+ "type": "CLIPTextEncode",
181
+ "pos": [
182
+ 1056,
183
+ 683
184
+ ],
185
+ "size": {
186
+ "0": 352.9139404296875,
187
+ "1": 113.16606140136719
188
+ },
189
+ "flags": {},
190
+ "order": 3,
191
+ "mode": 0,
192
+ "inputs": [
193
+ {
194
+ "name": "clip",
195
+ "type": "CLIP",
196
+ "link": 51
197
+ }
198
+ ],
199
+ "outputs": [
200
+ {
201
+ "name": "CONDITIONING",
202
+ "type": "CONDITIONING",
203
+ "links": [
204
+ 45
205
+ ],
206
+ "shape": 3,
207
+ "slot_index": 0
208
+ }
209
+ ],
210
+ "properties": {
211
+ "Node name for S&R": "CLIPTextEncode"
212
+ },
213
+ "widgets_values": [
214
+ "text, watermark"
215
+ ]
216
+ },
217
+ {
218
+ "id": 50,
219
+ "type": "VAEEncode",
220
+ "pos": [
221
+ 1119,
222
+ 1329
223
+ ],
224
+ "size": {
225
+ "0": 201.4841766357422,
226
+ "1": 55.59581756591797
227
+ },
228
+ "flags": {},
229
+ "order": 4,
230
+ "mode": 0,
231
+ "inputs": [
232
+ {
233
+ "name": "pixels",
234
+ "type": "IMAGE",
235
+ "link": 56
236
+ },
237
+ {
238
+ "name": "vae",
239
+ "type": "VAE",
240
+ "link": 57
241
+ }
242
+ ],
243
+ "outputs": [
244
+ {
245
+ "name": "LATENT",
246
+ "type": "LATENT",
247
+ "links": [
248
+ 58
249
+ ],
250
+ "shape": 3,
251
+ "slot_index": 0
252
+ }
253
+ ],
254
+ "properties": {
255
+ "Node name for S&R": "VAEEncode"
256
+ }
257
+ },
258
+ {
259
+ "id": 11,
260
+ "type": "LoadImage",
261
+ "pos": [
262
+ -135,
263
+ 907
264
+ ],
265
+ "size": {
266
+ "0": 670,
267
+ "1": 460
268
+ },
269
+ "flags": {},
270
+ "order": 0,
271
+ "mode": 0,
272
+ "outputs": [
273
+ {
274
+ "name": "IMAGE",
275
+ "type": "IMAGE",
276
+ "links": [
277
+ 56,
278
+ 59
279
+ ],
280
+ "shape": 3,
281
+ "slot_index": 0
282
+ },
283
+ {
284
+ "name": "MASK",
285
+ "type": "MASK",
286
+ "links": [],
287
+ "shape": 3,
288
+ "slot_index": 1
289
+ }
290
+ ],
291
+ "properties": {
292
+ "Node name for S&R": "LoadImage"
293
+ },
294
+ "widgets_values": [
295
+ "example.png",
296
+ "image"
297
+ ]
298
+ },
299
+ {
300
+ "id": 40,
301
+ "type": "CheckpointLoaderSimple",
302
+ "pos": [
303
+ 1124,
304
+ 1019
305
+ ],
306
+ "size": {
307
+ "0": 315,
308
+ "1": 98
309
+ },
310
+ "flags": {},
311
+ "order": 1,
312
+ "mode": 0,
313
+ "outputs": [
314
+ {
315
+ "name": "MODEL",
316
+ "type": "MODEL",
317
+ "links": [
318
+ 42
319
+ ],
320
+ "shape": 3,
321
+ "slot_index": 0
322
+ },
323
+ {
324
+ "name": "CLIP",
325
+ "type": "CLIP",
326
+ "links": [
327
+ 50,
328
+ 51
329
+ ],
330
+ "shape": 3,
331
+ "slot_index": 1
332
+ },
333
+ {
334
+ "name": "VAE",
335
+ "type": "VAE",
336
+ "links": [
337
+ 49,
338
+ 57
339
+ ],
340
+ "shape": 3,
341
+ "slot_index": 2
342
+ }
343
+ ],
344
+ "properties": {
345
+ "Node name for S&R": "CheckpointLoaderSimple"
346
+ },
347
+ "widgets_values": [
348
+ "dreamshaper_8.safetensors"
349
+ ]
350
+ },
351
+ {
352
+ "id": 51,
353
+ "type": "img2txt BLIP/Llava Multimodel Tagger",
354
+ "pos": [
355
+ 605,
356
+ 881
357
+ ],
358
+ "size": {
359
+ "0": 427.2057800292969,
360
+ "1": 476.26934814453125
361
+ },
362
+ "flags": {},
363
+ "order": 2,
364
+ "mode": 0,
365
+ "inputs": [
366
+ {
367
+ "name": "input_image",
368
+ "type": "IMAGE",
369
+ "link": 59
370
+ }
371
+ ],
372
+ "outputs": [
373
+ {
374
+ "name": "caption",
375
+ "type": "STRING",
376
+ "links": [
377
+ 60
378
+ ],
379
+ "shape": 3,
380
+ "slot_index": 0
381
+ }
382
+ ],
383
+ "properties": {
384
+ "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger"
385
+ },
386
+ "widgets_values": [
387
+ true,
388
+ false,
389
+ false,
390
+ false,
391
+ "a photograph of",
392
+ "What is the subject and background of this image?",
393
+ 0.7000000000000001,
394
+ 1.26,
395
+ 36,
396
+ 128,
397
+ 5,
398
+ "watermark, text, writing",
399
+ "a photograph of a girl dressed up, in pink dress and bright blue eyes poses in the grass with arms spread out in front of her face, holding an umbrella on a sky, "
400
+ ],
401
+ "color": "#322",
402
+ "bgcolor": "#533"
403
+ }
404
+ ],
405
+ "links": [
406
+ [
407
+ 42,
408
+ 40,
409
+ 0,
410
+ 39,
411
+ 0,
412
+ "MODEL"
413
+ ],
414
+ [
415
+ 44,
416
+ 41,
417
+ 0,
418
+ 39,
419
+ 1,
420
+ "CONDITIONING"
421
+ ],
422
+ [
423
+ 45,
424
+ 42,
425
+ 0,
426
+ 39,
427
+ 2,
428
+ "CONDITIONING"
429
+ ],
430
+ [
431
+ 48,
432
+ 39,
433
+ 0,
434
+ 45,
435
+ 0,
436
+ "LATENT"
437
+ ],
438
+ [
439
+ 49,
440
+ 40,
441
+ 2,
442
+ 45,
443
+ 1,
444
+ "VAE"
445
+ ],
446
+ [
447
+ 50,
448
+ 40,
449
+ 1,
450
+ 41,
451
+ 0,
452
+ "CLIP"
453
+ ],
454
+ [
455
+ 51,
456
+ 40,
457
+ 1,
458
+ 42,
459
+ 0,
460
+ "CLIP"
461
+ ],
462
+ [
463
+ 55,
464
+ 45,
465
+ 0,
466
+ 48,
467
+ 0,
468
+ "IMAGE"
469
+ ],
470
+ [
471
+ 56,
472
+ 11,
473
+ 0,
474
+ 50,
475
+ 0,
476
+ "IMAGE"
477
+ ],
478
+ [
479
+ 57,
480
+ 40,
481
+ 2,
482
+ 50,
483
+ 1,
484
+ "VAE"
485
+ ],
486
+ [
487
+ 58,
488
+ 50,
489
+ 0,
490
+ 39,
491
+ 3,
492
+ "LATENT"
493
+ ],
494
+ [
495
+ 59,
496
+ 11,
497
+ 0,
498
+ 51,
499
+ 0,
500
+ "IMAGE"
501
+ ],
502
+ [
503
+ 60,
504
+ 51,
505
+ 0,
506
+ 41,
507
+ 1,
508
+ "STRING"
509
+ ]
510
+ ],
511
+ "groups": [],
512
+ "config": {},
513
+ "extra": {
514
+ "ds": {
515
+ "scale": 0.9090909090909091,
516
+ "offset": {
517
+ "0": 304.575645264068,
518
+ "1": -258.56908735931404
519
+ }
520
+ }
521
+ },
522
+ "version": 0.4
523
+ }
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/workflow-examples/inpaint.json ADDED
@@ -0,0 +1,705 @@
1
+ {
2
+ "last_node_id": 61,
3
+ "last_link_id": 80,
4
+ "nodes": [
5
+ {
6
+ "id": 45,
7
+ "type": "VAEDecode",
8
+ "pos": [
9
+ 1998,
10
+ 1018
11
+ ],
12
+ "size": {
13
+ "0": 210,
14
+ "1": 46
15
+ },
16
+ "flags": {},
17
+ "order": 10,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "samples",
22
+ "type": "LATENT",
23
+ "link": 71
24
+ },
25
+ {
26
+ "name": "vae",
27
+ "type": "VAE",
28
+ "link": 49
29
+ }
30
+ ],
31
+ "outputs": [
32
+ {
33
+ "name": "IMAGE",
34
+ "type": "IMAGE",
35
+ "links": [
36
+ 55
37
+ ],
38
+ "shape": 3,
39
+ "slot_index": 0
40
+ }
41
+ ],
42
+ "properties": {
43
+ "Node name for S&R": "VAEDecode"
44
+ }
45
+ },
46
+ {
47
+ "id": 42,
48
+ "type": "CLIPTextEncode",
49
+ "pos": [
50
+ 1056,
51
+ 683
52
+ ],
53
+ "size": {
54
+ "0": 352.9139404296875,
55
+ "1": 113.16606140136719
56
+ },
57
+ "flags": {},
58
+ "order": 2,
59
+ "mode": 0,
60
+ "inputs": [
61
+ {
62
+ "name": "clip",
63
+ "type": "CLIP",
64
+ "link": 51
65
+ }
66
+ ],
67
+ "outputs": [
68
+ {
69
+ "name": "CONDITIONING",
70
+ "type": "CONDITIONING",
71
+ "links": [
72
+ 63
73
+ ],
74
+ "shape": 3,
75
+ "slot_index": 0
76
+ }
77
+ ],
78
+ "properties": {
79
+ "Node name for S&R": "CLIPTextEncode"
80
+ },
81
+ "widgets_values": [
82
+ "text, watermark"
83
+ ]
84
+ },
85
+ {
86
+ "id": 41,
87
+ "type": "CLIPTextEncode",
88
+ "pos": [
89
+ 1055,
90
+ 571
91
+ ],
92
+ "size": {
93
+ "0": 348.9403381347656,
94
+ "1": 56.439388275146484
95
+ },
96
+ "flags": {},
97
+ "order": 6,
98
+ "mode": 0,
99
+ "inputs": [
100
+ {
101
+ "name": "clip",
102
+ "type": "CLIP",
103
+ "link": 50
104
+ },
105
+ {
106
+ "name": "text",
107
+ "type": "STRING",
108
+ "link": 80,
109
+ "widget": {
110
+ "name": "text"
111
+ }
112
+ }
113
+ ],
114
+ "outputs": [
115
+ {
116
+ "name": "CONDITIONING",
117
+ "type": "CONDITIONING",
118
+ "links": [
119
+ 64
120
+ ],
121
+ "shape": 3,
122
+ "slot_index": 0
123
+ }
124
+ ],
125
+ "properties": {
126
+ "Node name for S&R": "CLIPTextEncode"
127
+ },
128
+ "widgets_values": [
129
+ ""
130
+ ]
131
+ },
132
+ {
133
+ "id": 58,
134
+ "type": "PreviewImage",
135
+ "pos": [
136
+ 616,
137
+ 1631
138
+ ],
139
+ "size": {
140
+ "0": 401.17840576171875,
141
+ "1": 246
142
+ },
143
+ "flags": {},
144
+ "order": 7,
145
+ "mode": 0,
146
+ "inputs": [
147
+ {
148
+ "name": "images",
149
+ "type": "IMAGE",
150
+ "link": 73
151
+ }
152
+ ],
153
+ "properties": {
154
+ "Node name for S&R": "PreviewImage"
155
+ }
156
+ },
157
+ {
158
+ "id": 57,
159
+ "type": "MaskToImage",
160
+ "pos": [
161
+ 617,
162
+ 1543
163
+ ],
164
+ "size": {
165
+ "0": 210,
166
+ "1": 26
167
+ },
168
+ "flags": {},
169
+ "order": 5,
170
+ "mode": 0,
171
+ "inputs": [
172
+ {
173
+ "name": "mask",
174
+ "type": "MASK",
175
+ "link": 78
176
+ }
177
+ ],
178
+ "outputs": [
179
+ {
180
+ "name": "IMAGE",
181
+ "type": "IMAGE",
182
+ "links": [
183
+ 73
184
+ ],
185
+ "shape": 3,
186
+ "slot_index": 0
187
+ }
188
+ ],
189
+ "properties": {
190
+ "Node name for S&R": "MaskToImage"
191
+ }
192
+ },
193
+ {
194
+ "id": 40,
195
+ "type": "CheckpointLoaderSimple",
196
+ "pos": [
197
+ 1044,
198
+ 1032
199
+ ],
200
+ "size": {
201
+ "0": 315,
202
+ "1": 98
203
+ },
204
+ "flags": {},
205
+ "order": 0,
206
+ "mode": 0,
207
+ "outputs": [
208
+ {
209
+ "name": "MODEL",
210
+ "type": "MODEL",
211
+ "links": [
212
+ 68
213
+ ],
214
+ "shape": 3,
215
+ "slot_index": 0
216
+ },
217
+ {
218
+ "name": "CLIP",
219
+ "type": "CLIP",
220
+ "links": [
221
+ 50,
222
+ 51
223
+ ],
224
+ "shape": 3,
225
+ "slot_index": 1
226
+ },
227
+ {
228
+ "name": "VAE",
229
+ "type": "VAE",
230
+ "links": [
231
+ 49,
232
+ 69
233
+ ],
234
+ "shape": 3,
235
+ "slot_index": 2
236
+ }
237
+ ],
238
+ "properties": {
239
+ "Node name for S&R": "CheckpointLoaderSimple"
240
+ },
241
+ "widgets_values": [
242
+ "experience_70-inpainting.safetensors"
243
+ ]
244
+ },
245
+ {
246
+ "id": 48,
247
+ "type": "PreviewImage",
248
+ "pos": [
249
+ 2039,
250
+ 1262
251
+ ],
252
+ "size": {
253
+ "0": 295.2332458496094,
254
+ "1": 293.2945251464844
255
+ },
256
+ "flags": {},
257
+ "order": 11,
258
+ "mode": 0,
259
+ "inputs": [
260
+ {
261
+ "name": "images",
262
+ "type": "IMAGE",
263
+ "link": 55
264
+ }
265
+ ],
266
+ "properties": {
267
+ "Node name for S&R": "PreviewImage"
268
+ }
269
+ },
270
+ {
271
+ "id": 56,
272
+ "type": "KSampler",
273
+ "pos": [
274
+ 1642,
275
+ 820
276
+ ],
277
+ "size": {
278
+ "0": 315,
279
+ "1": 262
280
+ },
281
+ "flags": {},
282
+ "order": 9,
283
+ "mode": 0,
284
+ "inputs": [
285
+ {
286
+ "name": "model",
287
+ "type": "MODEL",
288
+ "link": 68
289
+ },
290
+ {
291
+ "name": "positive",
292
+ "type": "CONDITIONING",
293
+ "link": 66
294
+ },
295
+ {
296
+ "name": "negative",
297
+ "type": "CONDITIONING",
298
+ "link": 67
299
+ },
300
+ {
301
+ "name": "latent_image",
302
+ "type": "LATENT",
303
+ "link": 65
304
+ }
305
+ ],
306
+ "outputs": [
307
+ {
308
+ "name": "LATENT",
309
+ "type": "LATENT",
310
+ "links": [
311
+ 71
312
+ ],
313
+ "shape": 3,
314
+ "slot_index": 0
315
+ }
316
+ ],
317
+ "properties": {
318
+ "Node name for S&R": "KSampler"
319
+ },
320
+ "widgets_values": [
321
+ 492464952856155,
322
+ "randomize",
323
+ 30,
324
+ 7,
325
+ "dpmpp_2m_sde_gpu",
326
+ "normal",
327
+ 0.8
328
+ ]
329
+ },
330
+ {
331
+ "id": 55,
332
+ "type": "ImageColorToMask",
333
+ "pos": [
334
+ 610,
335
+ 1425
336
+ ],
337
+ "size": {
338
+ "0": 315,
339
+ "1": 58
340
+ },
341
+ "flags": {},
342
+ "order": 3,
343
+ "mode": 0,
344
+ "inputs": [
345
+ {
346
+ "name": "image",
347
+ "type": "IMAGE",
348
+ "link": 61
349
+ }
350
+ ],
351
+ "outputs": [
352
+ {
353
+ "name": "MASK",
354
+ "type": "MASK",
355
+ "links": [
356
+ 77,
357
+ 78
358
+ ],
359
+ "shape": 3,
360
+ "slot_index": 0
361
+ }
362
+ ],
363
+ "properties": {
364
+ "Node name for S&R": "ImageColorToMask"
365
+ },
366
+ "widgets_values": [
367
+ 6198527
368
+ ]
369
+ },
370
+ {
371
+ "id": 54,
372
+ "type": "InpaintModelConditioning",
373
+ "pos": [
374
+ 1289,
375
+ 1377
376
+ ],
377
+ "size": {
378
+ "0": 216.59999084472656,
379
+ "1": 106
380
+ },
381
+ "flags": {},
382
+ "order": 8,
383
+ "mode": 0,
384
+ "inputs": [
385
+ {
386
+ "name": "positive",
387
+ "type": "CONDITIONING",
388
+ "link": 64
389
+ },
390
+ {
391
+ "name": "negative",
392
+ "type": "CONDITIONING",
393
+ "link": 63
394
+ },
395
+ {
396
+ "name": "vae",
397
+ "type": "VAE",
398
+ "link": 69
399
+ },
400
+ {
401
+ "name": "pixels",
402
+ "type": "IMAGE",
403
+ "link": 70
404
+ },
405
+ {
406
+ "name": "mask",
407
+ "type": "MASK",
408
+ "link": 77
409
+ }
410
+ ],
411
+ "outputs": [
412
+ {
413
+ "name": "positive",
414
+ "type": "CONDITIONING",
415
+ "links": [
416
+ 66
417
+ ],
418
+ "shape": 3,
419
+ "slot_index": 0
420
+ },
421
+ {
422
+ "name": "negative",
423
+ "type": "CONDITIONING",
424
+ "links": [
425
+ 67
426
+ ],
427
+ "shape": 3,
428
+ "slot_index": 1
429
+ },
430
+ {
431
+ "name": "latent",
432
+ "type": "LATENT",
433
+ "links": [
434
+ 65
435
+ ],
436
+ "shape": 3,
437
+ "slot_index": 2
438
+ }
439
+ ],
440
+ "properties": {
441
+ "Node name for S&R": "InpaintModelConditioning"
442
+ }
443
+ },
444
+ {
445
+ "id": 11,
446
+ "type": "LoadImage",
447
+ "pos": [
448
+ -135,
449
+ 907
450
+ ],
451
+ "size": {
452
+ "0": 670,
453
+ "1": 460
454
+ },
455
+ "flags": {},
456
+ "order": 1,
457
+ "mode": 0,
458
+ "outputs": [
459
+ {
460
+ "name": "IMAGE",
461
+ "type": "IMAGE",
462
+ "links": [
463
+ 61,
464
+ 70,
465
+ 79
466
+ ],
467
+ "shape": 3,
468
+ "slot_index": 0
469
+ },
470
+ {
471
+ "name": "MASK",
472
+ "type": "MASK",
473
+ "links": [],
474
+ "shape": 3,
475
+ "slot_index": 1
476
+ }
477
+ ],
478
+ "properties": {
479
+ "Node name for S&R": "LoadImage"
480
+ },
481
+ "widgets_values": [
482
+ "example.png",
483
+ "image"
484
+ ]
485
+ },
486
+ {
487
+ "id": 61,
488
+ "type": "img2txt BLIP/Llava Multimodel Tagger",
489
+ "pos": [
490
+ 599,
491
+ 886
492
+ ],
493
+ "size": [
494
+ 414.8329491017887,
495
+ 453.3791344354013
496
+ ],
497
+ "flags": {},
498
+ "order": 4,
499
+ "mode": 0,
500
+ "inputs": [
501
+ {
502
+ "name": "input_image",
503
+ "type": "IMAGE",
504
+ "link": 79
505
+ }
506
+ ],
507
+ "outputs": [
508
+ {
509
+ "name": "caption",
510
+ "type": "STRING",
511
+ "links": [
512
+ 80
513
+ ],
514
+ "shape": 3,
515
+ "slot_index": 0
516
+ }
517
+ ],
518
+ "properties": {
519
+ "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger"
520
+ },
521
+ "widgets_values": [
522
+ true,
523
+ false,
524
+ false,
525
+ false,
526
+ "a photograph of",
527
+ "What is the subject of this image?\n",
528
+ 0.8,
529
+ 1.2,
530
+ 36,
531
+ 128,
532
+ 5,
533
+ "watermark, text, writing"
534
+ ],
535
+ "color": "#322",
536
+ "bgcolor": "#533"
537
+ }
538
+ ],
539
+ "links": [
540
+ [
541
+ 49,
542
+ 40,
543
+ 2,
544
+ 45,
545
+ 1,
546
+ "VAE"
547
+ ],
548
+ [
549
+ 50,
550
+ 40,
551
+ 1,
552
+ 41,
553
+ 0,
554
+ "CLIP"
555
+ ],
556
+ [
557
+ 51,
558
+ 40,
559
+ 1,
560
+ 42,
561
+ 0,
562
+ "CLIP"
563
+ ],
564
+ [
565
+ 55,
566
+ 45,
567
+ 0,
568
+ 48,
569
+ 0,
570
+ "IMAGE"
571
+ ],
572
+ [
573
+ 61,
574
+ 11,
575
+ 0,
576
+ 55,
577
+ 0,
578
+ "IMAGE"
579
+ ],
580
+ [
581
+ 63,
582
+ 42,
583
+ 0,
584
+ 54,
585
+ 1,
586
+ "CONDITIONING"
587
+ ],
588
+ [
589
+ 64,
590
+ 41,
591
+ 0,
592
+ 54,
593
+ 0,
594
+ "CONDITIONING"
595
+ ],
596
+ [
597
+ 65,
598
+ 54,
599
+ 2,
600
+ 56,
601
+ 3,
602
+ "LATENT"
603
+ ],
604
+ [
605
+ 66,
606
+ 54,
607
+ 0,
608
+ 56,
609
+ 1,
610
+ "CONDITIONING"
611
+ ],
612
+ [
613
+ 67,
614
+ 54,
615
+ 1,
616
+ 56,
617
+ 2,
618
+ "CONDITIONING"
619
+ ],
620
+ [
621
+ 68,
622
+ 40,
623
+ 0,
624
+ 56,
625
+ 0,
626
+ "MODEL"
627
+ ],
628
+ [
629
+ 69,
630
+ 40,
631
+ 2,
632
+ 54,
633
+ 2,
634
+ "VAE"
635
+ ],
636
+ [
637
+ 70,
638
+ 11,
639
+ 0,
640
+ 54,
641
+ 3,
642
+ "IMAGE"
643
+ ],
644
+ [
645
+ 71,
646
+ 56,
647
+ 0,
648
+ 45,
649
+ 0,
650
+ "LATENT"
651
+ ],
652
+ [
653
+ 73,
654
+ 57,
655
+ 0,
656
+ 58,
657
+ 0,
658
+ "IMAGE"
659
+ ],
660
+ [
661
+ 77,
662
+ 55,
663
+ 0,
664
+ 54,
665
+ 4,
666
+ "MASK"
667
+ ],
668
+ [
669
+ 78,
670
+ 55,
671
+ 0,
672
+ 57,
673
+ 0,
674
+ "MASK"
675
+ ],
676
+ [
677
+ 79,
678
+ 11,
679
+ 0,
680
+ 61,
681
+ 0,
682
+ "IMAGE"
683
+ ],
684
+ [
685
+ 80,
686
+ 61,
687
+ 0,
688
+ 41,
689
+ 1,
690
+ "STRING"
691
+ ]
692
+ ],
693
+ "groups": [],
694
+ "config": {},
695
+ "extra": {
696
+ "ds": {
697
+ "scale": 0.8264462809917354,
698
+ "offset": {
699
+ "0": 478.9515963527572,
700
+ "1": -472.76124333876595
701
+ }
702
+ }
703
+ },
704
+ "version": 0.4
705
+ }
ComfyUI/custom_nodes/img2txt-comfyui-nodes/wiki/workflow-examples/txt2img.json ADDED
@@ -0,0 +1,498 @@
1
+ {
2
+ "last_node_id": 53,
3
+ "last_link_id": 61,
4
+ "nodes": [
5
+ {
6
+ "id": 41,
7
+ "type": "CLIPTextEncode",
8
+ "pos": [
9
+ 1055,
10
+ 571
11
+ ],
12
+ "size": {
13
+ "0": 348.9403381347656,
14
+ "1": 56.439388275146484
15
+ },
16
+ "flags": {},
17
+ "order": 5,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "clip",
22
+ "type": "CLIP",
23
+ "link": 50
24
+ },
25
+ {
26
+ "name": "text",
27
+ "type": "STRING",
28
+ "link": 61,
29
+ "widget": {
30
+ "name": "text"
31
+ }
32
+ }
33
+ ],
34
+ "outputs": [
35
+ {
36
+ "name": "CONDITIONING",
37
+ "type": "CONDITIONING",
38
+ "links": [
39
+ 44
40
+ ],
41
+ "shape": 3,
42
+ "slot_index": 0
43
+ }
44
+ ],
45
+ "properties": {
46
+ "Node name for S&R": "CLIPTextEncode"
47
+ },
48
+ "widgets_values": [
49
+ ""
50
+ ]
51
+ },
52
+ {
53
+ "id": 39,
54
+ "type": "KSampler",
55
+ "pos": [
56
+ 1587,
57
+ 982
58
+ ],
59
+ "size": {
60
+ "0": 315,
61
+ "1": 262
62
+ },
63
+ "flags": {},
64
+ "order": 6,
65
+ "mode": 0,
66
+ "inputs": [
67
+ {
68
+ "name": "model",
69
+ "type": "MODEL",
70
+ "link": 42
71
+ },
72
+ {
73
+ "name": "positive",
74
+ "type": "CONDITIONING",
75
+ "link": 44
76
+ },
77
+ {
78
+ "name": "negative",
79
+ "type": "CONDITIONING",
80
+ "link": 45
81
+ },
82
+ {
83
+ "name": "latent_image",
84
+ "type": "LATENT",
85
+ "link": 59
86
+ }
87
+ ],
88
+ "outputs": [
89
+ {
90
+ "name": "LATENT",
91
+ "type": "LATENT",
92
+ "links": [
93
+ 48
94
+ ],
95
+ "shape": 3,
96
+ "slot_index": 0
97
+ }
98
+ ],
99
+ "properties": {
100
+ "Node name for S&R": "KSampler"
101
+ },
102
+ "widgets_values": [
103
+ 438454791536393,
104
+ "randomize",
105
+ 20,
106
+ 8,
107
+ "euler",
108
+ "normal",
109
+ 1
110
+ ]
111
+ },
112
+ {
113
+ "id": 45,
114
+ "type": "VAEDecode",
115
+ "pos": [
116
+ 1998,
117
+ 1018
118
+ ],
119
+ "size": {
120
+ "0": 210,
121
+ "1": 46
122
+ },
123
+ "flags": {},
124
+ "order": 7,
125
+ "mode": 0,
126
+ "inputs": [
127
+ {
128
+ "name": "samples",
129
+ "type": "LATENT",
130
+ "link": 48
131
+ },
132
+ {
133
+ "name": "vae",
134
+ "type": "VAE",
135
+ "link": 49
136
+ }
137
+ ],
138
+ "outputs": [
139
+ {
140
+ "name": "IMAGE",
141
+ "type": "IMAGE",
142
+ "links": [
143
+ 55
144
+ ],
145
+ "shape": 3,
146
+ "slot_index": 0
147
+ }
148
+ ],
149
+ "properties": {
150
+ "Node name for S&R": "VAEDecode"
151
+ }
152
+ },
153
+ {
154
+ "id": 48,
155
+ "type": "PreviewImage",
156
+ "pos": [
157
+ 2039,
158
+ 1262
159
+ ],
160
+ "size": {
161
+ "0": 210,
162
+ "1": 246
163
+ },
164
+ "flags": {},
165
+ "order": 8,
166
+ "mode": 0,
167
+ "inputs": [
168
+ {
169
+ "name": "images",
170
+ "type": "IMAGE",
171
+ "link": 55
172
+ }
173
+ ],
174
+ "properties": {
175
+ "Node name for S&R": "PreviewImage"
176
+ }
177
+ },
178
+ {
179
+ "id": 42,
180
+ "type": "CLIPTextEncode",
181
+ "pos": [
182
+ 1056,
183
+ 683
184
+ ],
185
+ "size": {
186
+ "0": 352.9139404296875,
187
+ "1": 113.16606140136719
188
+ },
189
+ "flags": {},
190
+ "order": 4,
191
+ "mode": 0,
192
+ "inputs": [
193
+ {
194
+ "name": "clip",
195
+ "type": "CLIP",
196
+ "link": 51
197
+ }
198
+ ],
199
+ "outputs": [
200
+ {
201
+ "name": "CONDITIONING",
202
+ "type": "CONDITIONING",
203
+ "links": [
204
+ 45
205
+ ],
206
+ "shape": 3,
207
+ "slot_index": 0
208
+ }
209
+ ],
210
+ "properties": {
211
+ "Node name for S&R": "CLIPTextEncode"
212
+ },
213
+ "widgets_values": [
214
+ "text, watermark"
215
+ ]
216
+ },
217
+ {
218
+ "id": 52,
219
+ "type": "EmptyLatentImage",
220
+ "pos": [
221
+ 1126,
222
+ 1189
223
+ ],
224
+ "size": {
225
+ "0": 315,
226
+ "1": 106
227
+ },
228
+ "flags": {},
229
+ "order": 0,
230
+ "mode": 0,
231
+ "outputs": [
232
+ {
233
+ "name": "LATENT",
234
+ "type": "LATENT",
235
+ "links": [
236
+ 59
237
+ ],
238
+ "shape": 3,
239
+ "slot_index": 0
240
+ }
241
+ ],
242
+ "properties": {
243
+ "Node name for S&R": "EmptyLatentImage"
244
+ },
245
+ "widgets_values": [
246
+ 512,
247
+ 512,
248
+ 1
249
+ ]
250
+ },
251
+ {
252
+ "id": 11,
253
+ "type": "LoadImage",
254
+ "pos": [
255
+ -135,
256
+ 907
257
+ ],
258
+ "size": {
259
+ "0": 670,
260
+ "1": 460
261
+ },
262
+ "flags": {},
263
+ "order": 1,
264
+ "mode": 0,
265
+ "outputs": [
266
+ {
267
+ "name": "IMAGE",
268
+ "type": "IMAGE",
269
+ "links": [
270
+ 60
271
+ ],
272
+ "shape": 3,
273
+ "slot_index": 0
274
+ },
275
+ {
276
+ "name": "MASK",
277
+ "type": "MASK",
278
+ "links": [],
279
+ "shape": 3,
280
+ "slot_index": 1
281
+ }
282
+ ],
283
+ "properties": {
284
+ "Node name for S&R": "LoadImage"
285
+ },
286
+ "widgets_values": [
287
+ "example.png",
288
+ "image"
289
+ ]
290
+ },
291
+ {
292
+ "id": 40,
293
+ "type": "CheckpointLoaderSimple",
294
+ "pos": [
295
+ 1124,
296
+ 1019
297
+ ],
298
+ "size": {
299
+ "0": 315,
300
+ "1": 98
301
+ },
302
+ "flags": {},
303
+ "order": 2,
304
+ "mode": 0,
305
+ "outputs": [
306
+ {
307
+ "name": "MODEL",
308
+ "type": "MODEL",
309
+ "links": [
310
+ 42
311
+ ],
312
+ "shape": 3,
313
+ "slot_index": 0
314
+ },
315
+ {
316
+ "name": "CLIP",
317
+ "type": "CLIP",
318
+ "links": [
319
+ 50,
320
+ 51
321
+ ],
322
+ "shape": 3,
323
+ "slot_index": 1
324
+ },
325
+ {
326
+ "name": "VAE",
327
+ "type": "VAE",
328
+ "links": [
329
+ 49
330
+ ],
331
+ "shape": 3,
332
+ "slot_index": 2
333
+ }
334
+ ],
335
+ "properties": {
336
+ "Node name for S&R": "CheckpointLoaderSimple"
337
+ },
338
+ "widgets_values": [
339
+ "dreamshaper_8.safetensors"
340
+ ]
341
+ },
342
+ {
343
+ "id": 53,
344
+ "type": "img2txt BLIP/Llava Multimodel Tagger",
345
+ "pos": [
346
+ 584,
347
+ 865
348
+ ],
349
+ "size": [
350
+ 462.2727684830322,
351
+ 532.8236759410865
352
+ ],
353
+ "flags": {},
354
+ "order": 3,
355
+ "mode": 0,
356
+ "inputs": [
357
+ {
358
+ "name": "input_image",
359
+ "type": "IMAGE",
360
+ "link": 60
361
+ }
362
+ ],
363
+ "outputs": [
364
+ {
365
+ "name": "caption",
366
+ "type": "STRING",
367
+ "links": [
368
+ 61
369
+ ],
370
+ "shape": 3,
371
+ "slot_index": 0
372
+ }
373
+ ],
374
+ "properties": {
375
+ "Node name for S&R": "img2txt BLIP/Llava Multimodel Tagger"
376
+ },
377
+ "widgets_values": [
378
+ false,
379
+ false,
380
+ true,
381
+ false,
382
+ "a photograph of",
383
+ "What is a detailed description of this image?\nWhat is the background of this image?",
384
+ 0.8,
385
+ 1.2,
386
+ 36,
387
+ 128,
388
+ 5,
389
+ "watermark, text, writing",
390
+ "The image features a cartoon character standing against an abstract background consisting of green, blue, and white elements. The main focus is on the woman with bright yellow wings wearing pink attire while smiling at something off-frame in front of her that seems to be representing \"clouds\" or possibly another object within view but not clearly visible due to its distance from us as viewers., "
391
+ ],
392
+ "color": "#322",
393
+ "bgcolor": "#533"
394
+ }
395
+ ],
396
+ "links": [
397
+ [
398
+ 42,
399
+ 40,
400
+ 0,
401
+ 39,
402
+ 0,
403
+ "MODEL"
404
+ ],
405
+ [
406
+ 44,
407
+ 41,
408
+ 0,
409
+ 39,
410
+ 1,
411
+ "CONDITIONING"
412
+ ],
413
+ [
414
+ 45,
415
+ 42,
416
+ 0,
417
+ 39,
418
+ 2,
419
+ "CONDITIONING"
420
+ ],
421
+ [
422
+ 48,
423
+ 39,
424
+ 0,
425
+ 45,
426
+ 0,
427
+ "LATENT"
428
+ ],
429
+ [
430
+ 49,
431
+ 40,
432
+ 2,
433
+ 45,
434
+ 1,
435
+ "VAE"
436
+ ],
437
+ [
438
+ 50,
439
+ 40,
440
+ 1,
441
+ 41,
442
+ 0,
443
+ "CLIP"
444
+ ],
445
+ [
446
+ 51,
447
+ 40,
448
+ 1,
449
+ 42,
450
+ 0,
451
+ "CLIP"
452
+ ],
453
+ [
454
+ 55,
455
+ 45,
456
+ 0,
457
+ 48,
458
+ 0,
459
+ "IMAGE"
460
+ ],
461
+ [
462
+ 59,
463
+ 52,
464
+ 0,
465
+ 39,
466
+ 3,
467
+ "LATENT"
468
+ ],
469
+ [
470
+ 60,
471
+ 11,
472
+ 0,
473
+ 53,
474
+ 0,
475
+ "IMAGE"
476
+ ],
477
+ [
478
+ 61,
479
+ 53,
480
+ 0,
481
+ 41,
482
+ 1,
483
+ "STRING"
484
+ ]
485
+ ],
486
+ "groups": [],
487
+ "config": {},
488
+ "extra": {
489
+ "ds": {
490
+ "scale": 0.9090909090909091,
491
+ "offset": {
492
+ "0": 278.52736579431155,
493
+ "1": -323.6237095104226
494
+ }
495
+ }
496
+ },
497
+ "version": 0.4
498
+ }