John6666 committed on
Commit 08b4b80
1 Parent(s): db519e8

Upload 2 files


Simply prevents crashes when environment variables such as HUGGINGFACE_TOKEN and GROQ_API_KEY are not set. Intended primarily for debugging and for duplicating the Space onto CPU hardware.
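In outline the fix is: read each secret with os.getenv, load the token-gated models only when the token is present, wrap API-client construction in try/except, and have the dependent functions return early instead of raising. The snippet below is a minimal sketch of that pattern, condensed from the diff that follows rather than a drop-in replacement; the names mirror the real files.

import os
from groq import Groq

HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN", None)   # None instead of a crash when the secret is unset

tokenizer = text_model = image_adapter = None
if HF_TOKEN is not None:
    ...  # load the token-gated Llama tokenizer/model and the image adapter here
else:
    print("HUGGINGFACE_TOKEN is not set.")         # module still imports on a token-less duplicate

def joycaption(image):
    if text_model is None:                         # bail out instead of raising on missing models
        return ""
    ...

try:
    groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
except Exception:
    print("Error occurred while initializing Groq client.")
    groq_client = None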

Files changed (2)
  1. caption_models.py +191 -184
  2. huggingface_inference_node.py +170 -162
caption_models.py CHANGED
@@ -1,185 +1,192 @@
1
- import spaces
2
- import torch
3
- from PIL import Image
4
- from transformers import AutoProcessor, AutoModelForCausalLM, Qwen2VLForConditionalGeneration, AutoModel, AutoTokenizer, AutoModelForCausalLM
5
- from qwen_vl_utils import process_vision_info
6
- import numpy as np
7
- import os
8
- from datetime import datetime
9
- import subprocess
10
- import torch.nn as nn
11
-
12
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
13
-
14
- device = "cuda" if torch.cuda.is_available() else "cpu"
15
-
16
- HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN", None)
17
-
18
- # Initialize Florence model
19
- florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to(device).eval()
20
- florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True)
21
-
22
- # Initialize Qwen2-VL-2B model
23
- qwen_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True, torch_dtype="auto").to(device).eval()
24
- qwen_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
25
-
26
- # Add these new imports and constants
27
- CLIP_PATH = "google/siglip-so400m-patch14-384"
28
- VLM_PROMPT = "A descriptive caption for this image:\n"
29
- MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
30
- CHECKPOINT_PATH = "wpkklhc6"
31
-
32
- class ImageAdapter(nn.Module):
33
- def __init__(self, input_features: int, output_features: int):
34
- super().__init__()
35
- self.linear1 = nn.Linear(input_features, output_features)
36
- self.activation = nn.GELU()
37
- self.linear2 = nn.Linear(output_features, output_features)
38
-
39
- def forward(self, vision_outputs: torch.Tensor):
40
- x = self.linear1(vision_outputs)
41
- x = self.activation(x)
42
- x = self.linear2(x)
43
- return x
44
-
45
- # Load CLIP
46
- clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
47
- clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
48
- clip_model.eval()
49
- clip_model.requires_grad_(False)
50
- clip_model.to(device)
51
-
52
- # Tokenizer
53
- tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False, token=HF_TOKEN)
54
-
55
- # LLM
56
- text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16, token=HF_TOKEN)
57
- text_model.eval()
58
-
59
- # Image Adapter
60
- image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size)
61
- image_adapter.load_state_dict(torch.load(f"{CHECKPOINT_PATH}/image_adapter.pt", map_location="cpu"))
62
- image_adapter.eval()
63
- image_adapter.to(device)
64
-
65
- @spaces.GPU
66
- def florence_caption(image):
67
- if not isinstance(image, Image.Image):
68
- image = Image.fromarray(image)
69
-
70
- inputs = florence_processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
71
- generated_ids = florence_model.generate(
72
- input_ids=inputs["input_ids"],
73
- pixel_values=inputs["pixel_values"],
74
- max_new_tokens=1024,
75
- early_stopping=False,
76
- do_sample=False,
77
- num_beams=3,
78
- )
79
- generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
80
- parsed_answer = florence_processor.post_process_generation(
81
- generated_text,
82
- task="<MORE_DETAILED_CAPTION>",
83
- image_size=(image.width, image.height)
84
- )
85
- return parsed_answer["<MORE_DETAILED_CAPTION>"]
86
-
87
- def array_to_image_path(image_array):
88
- img = Image.fromarray(np.uint8(image_array))
89
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
90
- filename = f"image_{timestamp}.png"
91
- img.save(filename)
92
- full_path = os.path.abspath(filename)
93
- return full_path
94
-
95
- @spaces.GPU
96
- def qwen_caption(image):
97
- if not isinstance(image, Image.Image):
98
- image = Image.fromarray(np.uint8(image))
99
-
100
- image_path = array_to_image_path(np.array(image))
101
-
102
- messages = [
103
- {
104
- "role": "user",
105
- "content": [
106
- {
107
- "type": "image",
108
- "image": image_path,
109
- },
110
- {"type": "text", "text": "Describe this image in great detail in one paragraph."},
111
- ],
112
- }
113
- ]
114
-
115
- text = qwen_processor.apply_chat_template(
116
- messages, tokenize=False, add_generation_prompt=True
117
- )
118
- image_inputs, video_inputs = process_vision_info(messages)
119
- inputs = qwen_processor(
120
- text=[text],
121
- images=image_inputs,
122
- videos=video_inputs,
123
- padding=True,
124
- return_tensors="pt",
125
- )
126
- inputs = inputs.to(device)
127
-
128
- generated_ids = qwen_model.generate(**inputs, max_new_tokens=256)
129
- generated_ids_trimmed = [
130
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
131
- ]
132
- output_text = qwen_processor.batch_decode(
133
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
134
- )
135
-
136
- return output_text[0]
137
-
138
- @spaces.GPU
139
- @torch.no_grad()
140
- def joycaption(image):
141
- if not isinstance(image, Image.Image):
142
- image = Image.fromarray(np.uint8(image))
143
-
144
- # Preprocess image
145
- image = clip_processor(images=image, return_tensors='pt').pixel_values
146
- image = image.to(device)
147
-
148
- # Tokenize the prompt
149
- prompt = tokenizer.encode(VLM_PROMPT, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
150
-
151
- # Embed image
152
- with torch.amp.autocast_mode.autocast(device_type='cuda', enabled=True):
153
- vision_outputs = clip_model(pixel_values=image, output_hidden_states=True)
154
- image_features = vision_outputs.hidden_states[-2]
155
- embedded_images = image_adapter(image_features)
156
- embedded_images = embedded_images.to(device)
157
-
158
- # Embed prompt
159
- prompt_embeds = text_model.model.embed_tokens(prompt.to(device))
160
- embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=device, dtype=torch.int64))
161
-
162
- # Construct prompts
163
- inputs_embeds = torch.cat([
164
- embedded_bos.expand(embedded_images.shape[0], -1, -1),
165
- embedded_images.to(dtype=embedded_bos.dtype),
166
- prompt_embeds.expand(embedded_images.shape[0], -1, -1),
167
- ], dim=1)
168
-
169
- input_ids = torch.cat([
170
- torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
171
- torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
172
- prompt,
173
- ], dim=1).to(device)
174
- attention_mask = torch.ones_like(input_ids)
175
-
176
- generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5, suppress_tokens=None)
177
-
178
- # Trim off the prompt
179
- generate_ids = generate_ids[:, input_ids.shape[1]:]
180
- if generate_ids[0][-1] == tokenizer.eos_token_id:
181
- generate_ids = generate_ids[:, :-1]
182
-
183
- caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
184
-
185
  return caption.strip()
 
1
+ import spaces
2
+ import torch
3
+ from PIL import Image
4
+ from transformers import AutoProcessor, AutoModelForCausalLM, Qwen2VLForConditionalGeneration, AutoModel, AutoTokenizer, AutoModelForCausalLM
5
+ from qwen_vl_utils import process_vision_info
6
+ import numpy as np
7
+ import os
8
+ from datetime import datetime
9
+ import subprocess
10
+ import torch.nn as nn
11
+
12
+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
13
+
14
+ device = "cuda" if torch.cuda.is_available() else "cpu"
15
+
16
+ HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN", None)
17
+
18
+ # Initialize Florence model
19
+ florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True).to(device).eval()
20
+ florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-large', trust_remote_code=True)
21
+
22
+ # Initialize Qwen2-VL-2B model
23
+ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True, torch_dtype="auto").to(device).eval()
24
+ qwen_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)
25
+
26
+ # Add these new imports and constants
27
+ CLIP_PATH = "google/siglip-so400m-patch14-384"
28
+ VLM_PROMPT = "A descriptive caption for this image:\n"
29
+ MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
30
+ CHECKPOINT_PATH = "wpkklhc6"
31
+
32
+ class ImageAdapter(nn.Module):
33
+ def __init__(self, input_features: int, output_features: int):
34
+ super().__init__()
35
+ self.linear1 = nn.Linear(input_features, output_features)
36
+ self.activation = nn.GELU()
37
+ self.linear2 = nn.Linear(output_features, output_features)
38
+
39
+ def forward(self, vision_outputs: torch.Tensor):
40
+ x = self.linear1(vision_outputs)
41
+ x = self.activation(x)
42
+ x = self.linear2(x)
43
+ return x
44
+
45
+ # Load CLIP
46
+ clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
47
+ clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model
48
+ clip_model.eval()
49
+ clip_model.requires_grad_(False)
50
+ clip_model.to(device)
51
+
52
+ if HF_TOKEN is not None:
53
+ # Tokenizer
54
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False, token=HF_TOKEN)
55
+
56
+ # LLM
57
+ text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16, token=HF_TOKEN)
58
+ text_model.eval()
59
+
60
+ # Image Adapter
61
+ image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size)
62
+ image_adapter.load_state_dict(torch.load(f"{CHECKPOINT_PATH}/image_adapter.pt", map_location="cpu"))
63
+ image_adapter.eval()
64
+ image_adapter.to(device)
65
+ else:
66
+ print("HUGGINGFACE_TOKEN is not set.")
67
+ tokenizer = None
68
+ text_model = None
69
+ image_adapter = None
70
+
71
+ @spaces.GPU
72
+ def florence_caption(image):
73
+ if not isinstance(image, Image.Image):
74
+ image = Image.fromarray(image)
75
+
76
+ inputs = florence_processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
77
+ generated_ids = florence_model.generate(
78
+ input_ids=inputs["input_ids"],
79
+ pixel_values=inputs["pixel_values"],
80
+ max_new_tokens=1024,
81
+ early_stopping=False,
82
+ do_sample=False,
83
+ num_beams=3,
84
+ )
85
+ generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
86
+ parsed_answer = florence_processor.post_process_generation(
87
+ generated_text,
88
+ task="<MORE_DETAILED_CAPTION>",
89
+ image_size=(image.width, image.height)
90
+ )
91
+ return parsed_answer["<MORE_DETAILED_CAPTION>"]
92
+
93
+ def array_to_image_path(image_array):
94
+ img = Image.fromarray(np.uint8(image_array))
95
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
96
+ filename = f"image_{timestamp}.png"
97
+ img.save(filename)
98
+ full_path = os.path.abspath(filename)
99
+ return full_path
100
+
101
+ @spaces.GPU
102
+ def qwen_caption(image):
103
+ if not isinstance(image, Image.Image):
104
+ image = Image.fromarray(np.uint8(image))
105
+
106
+ image_path = array_to_image_path(np.array(image))
107
+
108
+ messages = [
109
+ {
110
+ "role": "user",
111
+ "content": [
112
+ {
113
+ "type": "image",
114
+ "image": image_path,
115
+ },
116
+ {"type": "text", "text": "Describe this image in great detail in one paragraph."},
117
+ ],
118
+ }
119
+ ]
120
+
121
+ text = qwen_processor.apply_chat_template(
122
+ messages, tokenize=False, add_generation_prompt=True
123
+ )
124
+ image_inputs, video_inputs = process_vision_info(messages)
125
+ inputs = qwen_processor(
126
+ text=[text],
127
+ images=image_inputs,
128
+ videos=video_inputs,
129
+ padding=True,
130
+ return_tensors="pt",
131
+ )
132
+ inputs = inputs.to(device)
133
+
134
+ generated_ids = qwen_model.generate(**inputs, max_new_tokens=256)
135
+ generated_ids_trimmed = [
136
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
137
+ ]
138
+ output_text = qwen_processor.batch_decode(
139
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
140
+ )
141
+
142
+ return output_text[0]
143
+
144
+ @spaces.GPU
145
+ @torch.no_grad()
146
+ def joycaption(image):
147
+ if text_model is None: return ""
148
+ if not isinstance(image, Image.Image):
149
+ image = Image.fromarray(np.uint8(image))
150
+
151
+ # Preprocess image
152
+ image = clip_processor(images=image, return_tensors='pt').pixel_values
153
+ image = image.to(device)
154
+
155
+ # Tokenize the prompt
156
+ prompt = tokenizer.encode(VLM_PROMPT, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
157
+
158
+ # Embed image
159
+ with torch.amp.autocast_mode.autocast(device_type='cuda', enabled=True):
160
+ vision_outputs = clip_model(pixel_values=image, output_hidden_states=True)
161
+ image_features = vision_outputs.hidden_states[-2]
162
+ embedded_images = image_adapter(image_features)
163
+ embedded_images = embedded_images.to(device)
164
+
165
+ # Embed prompt
166
+ prompt_embeds = text_model.model.embed_tokens(prompt.to(device))
167
+ embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=device, dtype=torch.int64))
168
+
169
+ # Construct prompts
170
+ inputs_embeds = torch.cat([
171
+ embedded_bos.expand(embedded_images.shape[0], -1, -1),
172
+ embedded_images.to(dtype=embedded_bos.dtype),
173
+ prompt_embeds.expand(embedded_images.shape[0], -1, -1),
174
+ ], dim=1)
175
+
176
+ input_ids = torch.cat([
177
+ torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
178
+ torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
179
+ prompt,
180
+ ], dim=1).to(device)
181
+ attention_mask = torch.ones_like(input_ids)
182
+
183
+ generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5, suppress_tokens=None)
184
+
185
+ # Trim off the prompt
186
+ generate_ids = generate_ids[:, input_ids.shape[1]:]
187
+ if generate_ids[0][-1] == tokenizer.eos_token_id:
188
+ generate_ids = generate_ids[:, :-1]
189
+
190
+ caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
191
+
192
  return caption.strip()
huggingface_inference_node.py CHANGED
@@ -1,162 +1,170 @@
1
- import os
2
- import re
3
- from datetime import datetime
4
-
5
- import anthropic
6
- from groq import Groq
7
- from openai import OpenAI
8
-
9
- huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
10
- groq_api_key = os.getenv("GROQ_API_KEY")
11
-
12
-
13
- class LLMInferenceNode:
14
- def __init__(self):
15
- self.huggingface_client = OpenAI(
16
- base_url="https://api-inference.huggingface.co/v1/",
17
- api_key=huggingface_token,
18
- )
19
- self.groq_client = Groq(api_key=groq_api_key)
20
-
21
- def generate(
22
- self,
23
- input_text,
24
- happy_talk,
25
- compress,
26
- compression_level,
27
- poster,
28
- prompt_type,
29
- custom_base_prompt="",
30
- provider="Hugging Face",
31
- api_key=None,
32
- model=None,
33
- ):
34
- try:
35
- default_happy_prompt = """Create a detailed visually descriptive caption of this description, which will be used as a prompt for a text to image AI system (caption only, no instructions like "create an image").Remove any mention of digital artwork or artwork style. Give detailed visual descriptions of the character(s), including ethnicity, skin tone, expression etc. Imagine using keywords for a still for someone who has aphantasia. Describe the image style, e.g. any photographic or art styles / techniques utilized. Make sure to fully describe all aspects of the cinematography, with abundant technical details and visual descriptions. If there is more than one image, combine the elements and characters from all of the images creatively into a single cohesive composition with a single background, inventing an interaction between the characters. Be creative in combining the characters into a single cohesive scene. Focus on two primary characters (or one) and describe an interesting interaction between them, such as a hug, a kiss, a fight, giving an object, an emotional reaction / interaction. If there is more than one background in the images, pick the most appropriate one. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph. If you feel the images are inappropriate, invent a new scene / characters inspired by these. Additionally, incorporate a specific movie director's visual style and describe the lighting setup in detail, including the type, color, and placement of light sources to create the desired mood and atmosphere. Always frame the scene, including details about the film grain, color grading, and any artifacts or characteristics specific."""
36
-
37
- default_simple_prompt = """Create a brief, straightforward caption for this description, suitable for a text-to-image AI system. Focus on the main elements, key characters, and overall scene without elaborate details. Provide a clear and concise description in one or two sentences. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
38
-
39
- poster_prompt = """Analyze the provided description and extract key information to create a movie poster style description. Format the output as follows:
40
- Title: A catchy, intriguing title that captures the essence of the scene, place the title in "".
41
- Main character: Give a description of the main character.
42
- Background: Describe the background in detail.
43
- Supporting characters: Describe the supporting characters
44
- Branding type: Describe the branding type
45
- Tagline: Include a tagline that captures the essence of the movie.
46
- Visual style: Ensure that the visual style fits the branding type and tagline.
47
- You are allowed to make up film and branding names, and do them like 80's, 90's or modern movie posters.Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
48
-
49
- only_objects_prompt = """Create a highly detailed and visually rich description focusing solely on inanimate objects, without including any human or animal figures. Describe the objects' shapes, sizes, colors, textures, and materials in great detail. Pay attention to their arrangement, positioning, and how they interact with light and shadow. Include information about the setting or environment these objects are in, such as indoor/outdoor, time of day, weather conditions, and any atmospheric effects. Mention any unique features, patterns, or imperfections on the objects. Describe the overall composition, perspective, and any artistic techniques that might be employed to render these objects (e.g., photorealism, impressionistic style, etc.). Your description should paint a vivid picture that allows someone to imagine the scene without seeing it, focusing on the beauty, complexity, or significance of everyday objects. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
50
-
51
- no_figure_prompt = """Generate a comprehensive and visually evocative description of a scene or landscape without including any human or animal figures. Focus on the environment, natural elements, and man-made structures if present. Describe the topography, vegetation, weather conditions, and time of day in great detail. Pay attention to colors, textures, and how light interacts with different elements of the scene. If there are buildings or other structures, describe their architecture, condition, and how they fit into the landscape. Include sensory details beyond just visual elements - mention sounds, smells, and the overall atmosphere or mood of the scene. Describe any notable features like bodies of water, geological formations, or sky phenomena. Consider the perspective from which the scene is viewed and how this affects the composition. Your description should transport the reader to this location, allowing them to vividly imagine the scene without any living subjects present. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
52
-
53
- landscape_prompt = """Create an immersive and detailed description of a landscape, focusing on its natural beauty and geographical features. Begin with the overall topography - is it mountainous, coastal, forested, desert, or a combination? Describe the horizon and how land meets sky. Detail the vegetation, noting types of trees, flowers, or grass, and how they're distributed across the landscape. Include information about any water features - rivers, lakes, oceans - and how they interact with the land. Describe the sky, including cloud formations, color gradients, and any celestial bodies visible. Pay attention to the quality of light, time of day, and season, explaining how these factors affect the colors and shadows in the scene. Include details about weather conditions and how they impact the landscape. Mention any geological features like rock formations, cliffs, or unique land patterns. If there are any distant man-made elements, describe how they integrate with the natural setting. Your description should capture the grandeur and mood of the landscape, allowing the reader to feel as if they're standing within this awe-inspiring natural scene. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
54
-
55
- fantasy_prompt = """Craft an extraordinarily detailed and imaginative description of a fantasy scene, blending elements of magic, otherworldly creatures, and fantastical environments. Begin by setting the overall tone - is this a dark and foreboding realm, a whimsical fairytale setting, or an epic high-fantasy world? Describe the landscape, including any impossible or magical geographical features like floating islands, crystal forests, or rivers of starlight. Detail the flora and fauna, focusing on fantastical plants and creatures that don't exist in our world. Include descriptions of any structures or ruins, emphasizing their otherworldly architecture and magical properties. Describe the sky and any celestial bodies, considering how they might differ from our reality. Include details about the presence of magic - how it manifests visually, its effects on the environment, and any magical phenomena occurring in the scene. If there are characters present, describe their appearance, focusing on non-human features, magical auras, or fantastical clothing and accessories. Pay attention to colors, textures, and light sources, especially those that couldn't exist in the real world. Your description should transport the reader to a realm of pure imagination, where the laws of physics and nature as we know them don't apply. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
56
-
57
- prompt_types = {
58
- "happy": default_happy_prompt,
59
- "simple": default_simple_prompt,
60
- "poster": poster_prompt,
61
- "only_objects": only_objects_prompt,
62
- "no_figure": no_figure_prompt,
63
- "landscape": landscape_prompt,
64
- "fantasy": fantasy_prompt,
65
- }
66
-
67
- # Update this part to handle the prompt_type correctly
68
- print(f"Received prompt_type: '{prompt_type}'") # Debug print
69
- if prompt_type and prompt_type.strip() and prompt_type in prompt_types:
70
- base_prompt = prompt_types[prompt_type]
71
- print(f"Using {prompt_type} prompt")
72
- elif custom_base_prompt.strip():
73
- base_prompt = custom_base_prompt
74
- print("Using custom base prompt")
75
- else:
76
- base_prompt = default_happy_prompt
77
- print(
78
- f"Warning: Unknown or empty prompt type '{prompt_type}'. Using default happy prompt."
79
- )
80
-
81
- if compress and not poster:
82
- compression_chars = {
83
- "soft": 600 if happy_talk else 300,
84
- "medium": 400 if happy_talk else 200,
85
- "hard": 200 if happy_talk else 100,
86
- }
87
- char_limit = compression_chars[compression_level]
88
- base_prompt += f" Compress the output to be concise while retaining key visual details. MAX OUTPUT SIZE no more than {char_limit} characters."
89
-
90
- system_message = "You are a helpful assistant. Try your best to give the best response possible to the user."
91
- user_message = f"{base_prompt}\nDescription: {input_text}"
92
-
93
- if provider == "Hugging Face":
94
- response = self.huggingface_client.chat.completions.create(
95
- model=model or "meta-llama/Meta-Llama-3.1-70B-Instruct",
96
- max_tokens=1024,
97
- temperature=0.7,
98
- top_p=0.95,
99
- messages=[
100
- {"role": "system", "content": system_message},
101
- {"role": "user", "content": user_message},
102
- ],
103
- )
104
- output = response.choices[0].message.content.strip()
105
-
106
- elif provider == "OpenAI":
107
- openai_client = OpenAI(api_key=api_key)
108
- response = openai_client.chat.completions.create(
109
- model=model or "gpt-4",
110
- max_tokens=1024,
111
- temperature=0.7,
112
- messages=[
113
- {"role": "system", "content": system_message},
114
- {"role": "user", "content": user_message},
115
- ],
116
- )
117
- output = response.choices[0].message.content.strip()
118
-
119
- elif provider == "Anthropic":
120
- anthropic_client = anthropic.Anthropic(api_key=api_key)
121
- response = anthropic_client.messages.create(
122
- model=model or "claude-3-5-sonnet-20240620",
123
- max_tokens=1024,
124
- temperature=0.7,
125
- system=system_message,
126
- messages=[
127
- {
128
- "role": "user",
129
- "content": [{"type": "text", "text": user_message}],
130
- }
131
- ],
132
- )
133
- output = response.content[0].text
134
-
135
- elif provider == "Groq":
136
- response = self.groq_client.chat.completions.create(
137
- model=model or "llama-3.1-70b-versatile",
138
- max_tokens=1024,
139
- temperature=0.7,
140
- messages=[
141
- {"role": "system", "content": system_message},
142
- {"role": "user", "content": user_message},
143
- ],
144
- )
145
- output = response.choices[0].message.content.strip()
146
-
147
- else:
148
- raise ValueError(f"Unsupported provider: {provider}")
149
-
150
- # Clean up the output
151
- if ": " in output:
152
- output = output.split(": ", 1)[1].strip()
153
- elif output.lower().startswith("here"):
154
- sentences = output.split(". ")
155
- if len(sentences) > 1:
156
- output = ". ".join(sentences[1:]).strip()
157
-
158
- return output
159
-
160
- except Exception as e:
161
- print(f"An error occurred: {e}")
162
- return f"Error occurred while processing the request: {str(e)}"
 
1
+ import os
2
+ import re
3
+ from datetime import datetime
4
+
5
+ import anthropic
6
+ from groq import Groq
7
+ from openai import OpenAI
8
+
9
+ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
10
+ groq_api_key = os.getenv("GROQ_API_KEY")
11
+
12
+
13
+ class LLMInferenceNode:
14
+ def __init__(self):
15
+ try:
16
+ self.huggingface_client = OpenAI(
17
+ base_url="https://api-inference.huggingface.co/v1/",
18
+ api_key=huggingface_token,
19
+ )
20
+ except Exception:
21
+ print("Error occurred while initializing OpenAI client.")
22
+ self.huggingface_client = None
23
+ try:
24
+ self.groq_client = Groq(api_key=groq_api_key)
25
+ except Exception:
26
+ print("Error occurred while initializing Groq client.")
27
+ self.groq_client = None
28
+
29
+ def generate(
30
+ self,
31
+ input_text,
32
+ happy_talk,
33
+ compress,
34
+ compression_level,
35
+ poster,
36
+ prompt_type,
37
+ custom_base_prompt="",
38
+ provider="Hugging Face",
39
+ api_key=None,
40
+ model=None,
41
+ ):
42
+ try:
43
+ default_happy_prompt = """Create a detailed visually descriptive caption of this description, which will be used as a prompt for a text to image AI system (caption only, no instructions like "create an image").Remove any mention of digital artwork or artwork style. Give detailed visual descriptions of the character(s), including ethnicity, skin tone, expression etc. Imagine using keywords for a still for someone who has aphantasia. Describe the image style, e.g. any photographic or art styles / techniques utilized. Make sure to fully describe all aspects of the cinematography, with abundant technical details and visual descriptions. If there is more than one image, combine the elements and characters from all of the images creatively into a single cohesive composition with a single background, inventing an interaction between the characters. Be creative in combining the characters into a single cohesive scene. Focus on two primary characters (or one) and describe an interesting interaction between them, such as a hug, a kiss, a fight, giving an object, an emotional reaction / interaction. If there is more than one background in the images, pick the most appropriate one. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph. If you feel the images are inappropriate, invent a new scene / characters inspired by these. Additionally, incorporate a specific movie director's visual style and describe the lighting setup in detail, including the type, color, and placement of light sources to create the desired mood and atmosphere. Always frame the scene, including details about the film grain, color grading, and any artifacts or characteristics specific."""
44
+
45
+ default_simple_prompt = """Create a brief, straightforward caption for this description, suitable for a text-to-image AI system. Focus on the main elements, key characters, and overall scene without elaborate details. Provide a clear and concise description in one or two sentences. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
46
+
47
+ poster_prompt = """Analyze the provided description and extract key information to create a movie poster style description. Format the output as follows:
48
+ Title: A catchy, intriguing title that captures the essence of the scene, place the title in "".
49
+ Main character: Give a description of the main character.
50
+ Background: Describe the background in detail.
51
+ Supporting characters: Describe the supporting characters
52
+ Branding type: Describe the branding type
53
+ Tagline: Include a tagline that captures the essence of the movie.
54
+ Visual style: Ensure that the visual style fits the branding type and tagline.
55
+ You are allowed to make up film and branding names, and do them like 80's, 90's or modern movie posters.Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
56
+
57
+ only_objects_prompt = """Create a highly detailed and visually rich description focusing solely on inanimate objects, without including any human or animal figures. Describe the objects' shapes, sizes, colors, textures, and materials in great detail. Pay attention to their arrangement, positioning, and how they interact with light and shadow. Include information about the setting or environment these objects are in, such as indoor/outdoor, time of day, weather conditions, and any atmospheric effects. Mention any unique features, patterns, or imperfections on the objects. Describe the overall composition, perspective, and any artistic techniques that might be employed to render these objects (e.g., photorealism, impressionistic style, etc.). Your description should paint a vivid picture that allows someone to imagine the scene without seeing it, focusing on the beauty, complexity, or significance of everyday objects. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
58
+
59
+ no_figure_prompt = """Generate a comprehensive and visually evocative description of a scene or landscape without including any human or animal figures. Focus on the environment, natural elements, and man-made structures if present. Describe the topography, vegetation, weather conditions, and time of day in great detail. Pay attention to colors, textures, and how light interacts with different elements of the scene. If there are buildings or other structures, describe their architecture, condition, and how they fit into the landscape. Include sensory details beyond just visual elements - mention sounds, smells, and the overall atmosphere or mood of the scene. Describe any notable features like bodies of water, geological formations, or sky phenomena. Consider the perspective from which the scene is viewed and how this affects the composition. Your description should transport the reader to this location, allowing them to vividly imagine the scene without any living subjects present. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
60
+
61
+ landscape_prompt = """Create an immersive and detailed description of a landscape, focusing on its natural beauty and geographical features. Begin with the overall topography - is it mountainous, coastal, forested, desert, or a combination? Describe the horizon and how land meets sky. Detail the vegetation, noting types of trees, flowers, or grass, and how they're distributed across the landscape. Include information about any water features - rivers, lakes, oceans - and how they interact with the land. Describe the sky, including cloud formations, color gradients, and any celestial bodies visible. Pay attention to the quality of light, time of day, and season, explaining how these factors affect the colors and shadows in the scene. Include details about weather conditions and how they impact the landscape. Mention any geological features like rock formations, cliffs, or unique land patterns. If there are any distant man-made elements, describe how they integrate with the natural setting. Your description should capture the grandeur and mood of the landscape, allowing the reader to feel as if they're standing within this awe-inspiring natural scene. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
62
+
63
+ fantasy_prompt = """Craft an extraordinarily detailed and imaginative description of a fantasy scene, blending elements of magic, otherworldly creatures, and fantastical environments. Begin by setting the overall tone - is this a dark and foreboding realm, a whimsical fairytale setting, or an epic high-fantasy world? Describe the landscape, including any impossible or magical geographical features like floating islands, crystal forests, or rivers of starlight. Detail the flora and fauna, focusing on fantastical plants and creatures that don't exist in our world. Include descriptions of any structures or ruins, emphasizing their otherworldly architecture and magical properties. Describe the sky and any celestial bodies, considering how they might differ from our reality. Include details about the presence of magic - how it manifests visually, its effects on the environment, and any magical phenomena occurring in the scene. If there are characters present, describe their appearance, focusing on non-human features, magical auras, or fantastical clothing and accessories. Pay attention to colors, textures, and light sources, especially those that couldn't exist in the real world. Your description should transport the reader to a realm of pure imagination, where the laws of physics and nature as we know them don't apply. Your output is only the caption itself, no comments or extra formatting. The caption is in a single long paragraph."""
64
+
65
+ prompt_types = {
66
+ "happy": default_happy_prompt,
67
+ "simple": default_simple_prompt,
68
+ "poster": poster_prompt,
69
+ "only_objects": only_objects_prompt,
70
+ "no_figure": no_figure_prompt,
71
+ "landscape": landscape_prompt,
72
+ "fantasy": fantasy_prompt,
73
+ }
74
+
75
+ # Update this part to handle the prompt_type correctly
76
+ print(f"Received prompt_type: '{prompt_type}'") # Debug print
77
+ if prompt_type and prompt_type.strip() and prompt_type in prompt_types:
78
+ base_prompt = prompt_types[prompt_type]
79
+ print(f"Using {prompt_type} prompt")
80
+ elif custom_base_prompt.strip():
81
+ base_prompt = custom_base_prompt
82
+ print("Using custom base prompt")
83
+ else:
84
+ base_prompt = default_happy_prompt
85
+ print(
86
+ f"Warning: Unknown or empty prompt type '{prompt_type}'. Using default happy prompt."
87
+ )
88
+
89
+ if compress and not poster:
90
+ compression_chars = {
91
+ "soft": 600 if happy_talk else 300,
92
+ "medium": 400 if happy_talk else 200,
93
+ "hard": 200 if happy_talk else 100,
94
+ }
95
+ char_limit = compression_chars[compression_level]
96
+ base_prompt += f" Compress the output to be concise while retaining key visual details. MAX OUTPUT SIZE no more than {char_limit} characters."
97
+
98
+ system_message = "You are a helpful assistant. Try your best to give the best response possible to the user."
99
+ user_message = f"{base_prompt}\nDescription: {input_text}"
100
+
101
+ if provider == "Hugging Face" and self.huggingface_client is not None:
102
+ response = self.huggingface_client.chat.completions.create(
103
+ model=model or "meta-llama/Meta-Llama-3.1-70B-Instruct",
104
+ max_tokens=1024,
105
+ temperature=0.7,
106
+ top_p=0.95,
107
+ messages=[
108
+ {"role": "system", "content": system_message},
109
+ {"role": "user", "content": user_message},
110
+ ],
111
+ )
112
+ output = response.choices[0].message.content.strip()
113
+
114
+ elif provider == "OpenAI":
115
+ openai_client = OpenAI(api_key=api_key)
116
+ response = openai_client.chat.completions.create(
117
+ model=model or "gpt-4",
118
+ max_tokens=1024,
119
+ temperature=0.7,
120
+ messages=[
121
+ {"role": "system", "content": system_message},
122
+ {"role": "user", "content": user_message},
123
+ ],
124
+ )
125
+ output = response.choices[0].message.content.strip()
126
+
127
+ elif provider == "Anthropic":
128
+ anthropic_client = anthropic.Anthropic(api_key=api_key)
129
+ response = anthropic_client.messages.create(
130
+ model=model or "claude-3-5-sonnet-20240620",
131
+ max_tokens=1024,
132
+ temperature=0.7,
133
+ system=system_message,
134
+ messages=[
135
+ {
136
+ "role": "user",
137
+ "content": [{"type": "text", "text": user_message}],
138
+ }
139
+ ],
140
+ )
141
+ output = response.content[0].text
142
+
143
+ elif provider == "Groq" and self.groq_client is not None:
144
+ response = self.groq_client.chat.completions.create(
145
+ model=model or "llama-3.1-70b-versatile",
146
+ max_tokens=1024,
147
+ temperature=0.7,
148
+ messages=[
149
+ {"role": "system", "content": system_message},
150
+ {"role": "user", "content": user_message},
151
+ ],
152
+ )
153
+ output = response.choices[0].message.content.strip()
154
+
155
+ else:
156
+ raise ValueError(f"Unsupported provider: {provider}")
157
+
158
+ # Clean up the output
159
+ if ": " in output:
160
+ output = output.split(": ", 1)[1].strip()
161
+ elif output.lower().startswith("here"):
162
+ sentences = output.split(". ")
163
+ if len(sentences) > 1:
164
+ output = ". ".join(sentences[1:]).strip()
165
+
166
+ return output
167
+
168
+ except Exception as e:
169
+ print(f"An error occurred: {e}")
170
+ return f"Error occurred while processing the request: {str(e)}"