Bils committed on
Commit 3dc7f18 · verified · 1 Parent(s): 601ca08

Update app.py

Files changed (1)
  1. app.py +209 -115
app.py CHANGED
@@ -4,144 +4,238 @@ from transformers import AutoConfig, AutoModelForCausalLM
  from janus.models import MultiModalityCausalLM, VLChatProcessor
  from PIL import Image
  import numpy as np
- import spaces
-
- # Load the model and processor
- model_path = "deepseek-ai/Janus-Pro-7B"
- config = AutoConfig.from_pretrained(model_path)
- language_config = config.language_config
- language_config._attn_implementation = 'eager'
-
- vl_gpt = AutoModelForCausalLM.from_pretrained(
-     model_path,
-     language_config=language_config,
-     trust_remote_code=True
- )
- vl_gpt = vl_gpt.to(torch.bfloat16).cuda() if torch.cuda.is_available() else vl_gpt.to(torch.float16)
-
- vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
- tokenizer = vl_chat_processor.tokenizer
- cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
- # Helper functions
- def generate(input_ids, width, height, cfg_weight=5, temperature=1.0, parallel_size=5, patch_size=16):
-     torch.cuda.empty_cache()
-
-     tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
-     for i in range(parallel_size * 2):
-         tokens[i, :] = input_ids
-         if i % 2 != 0:
-             tokens[i, 1:-1] = vl_chat_processor.pad_id

-     inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
-     generated_tokens = torch.zeros((parallel_size, 576), dtype=torch.int).to(cuda_device)
-
-     pkv = None
-     for i in range(576):
          with torch.no_grad():
-             outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds, use_cache=True, past_key_values=pkv)
-             pkv = outputs.past_key_values
-             hidden_states = outputs.last_hidden_state
-             logits = vl_gpt.gen_head(hidden_states[:, -1, :])
-
-             logit_cond = logits[0::2, :]
-             logit_uncond = logits[1::2, :]
-             logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
-
-             probs = torch.softmax(logits / temperature, dim=-1)
-             next_token = torch.multinomial(probs, num_samples=1)
-             generated_tokens[:, i] = next_token.squeeze(dim=-1)
-
-             next_token = torch.cat([next_token.unsqueeze(dim=1)] * 2, dim=1).view(-1)
-             img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
-             inputs_embeds = img_embeds.unsqueeze(dim=1)
-
-     patches = vl_gpt.gen_vision_model.decode_code(
-         generated_tokens.to(dtype=torch.int),
-         shape=[parallel_size, 8, width // patch_size, height // patch_size]
-     )
-     return patches

  def unpack(patches, width, height, parallel_size=5):
-     # Detach the tensor before converting to numpy
-     patches = patches.detach().to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
-     patches = np.clip((patches + 1) / 2 * 255, 0, 255)
-
-     images = [Image.fromarray(patches[i].astype(np.uint8)) for i in range(parallel_size)]
-     return images

  @torch.inference_mode()
  @spaces.GPU(duration=120)
- def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0):
-     torch.cuda.empty_cache()
-
-     if seed is not None:
          torch.manual_seed(seed)
-         torch.cuda.manual_seed(seed)
-         np.random.seed(seed)
-
-     width, height, parallel_size = 384, 384, 5
-
-     messages = [
-         {'role': '<|User|>', 'content': prompt},
-         {'role': '<|Assistant|>', 'content': ''}
-     ]
-
-     text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
-         conversations=messages, sft_format=vl_chat_processor.sft_format, system_prompt=''
-     )
-     text += vl_chat_processor.image_start_tag
-
-     input_ids = torch.LongTensor(tokenizer.encode(text))
-     patches = generate(input_ids, width, height, cfg_weight=guidance, temperature=t2i_temperature, parallel_size=parallel_size)
-
-     return unpack(patches, width, height, parallel_size)

- # Gradio interface
  def create_interface():
-     with gr.Blocks() as demo:
          gr.Markdown("""
          # Text-to-Image Generation with Janus-Pro-7B
-
-         Welcome to the Janus-Pro-7B Text-to-Image Generator! This advanced AI model by DeepSeek offers state-of-the-art capabilities in generating images from textual descriptions. Leveraging a unified multimodal framework, Janus-Pro-7B excels in both understanding and generating content, providing detailed and accurate visual representations based on your prompts.
-
-         **Key Features:**
-         - **High-Quality Image Generation:** Produces stable and detailed images that often surpass those from other leading models.
-
-         For more information about Janus-Pro-7B, visit the [official Hugging Face model page](https://huggingface.co/deepseek-ai/Janus-Pro-7B).
          """)
-
-         prompt_input = gr.Textbox(label="Prompt (describe the image)")
-
-         # Option to toggle additional parameters
-         with gr.Accordion("Advanced Parameters", open=False):
-             seed_input = gr.Number(label="Seed (Optional)", value=12345, precision=0)
-             guidance_slider = gr.Slider(label="CFG Guidance Weight", minimum=1, maximum=10, value=5, step=0.5)
-             temperature_slider = gr.Slider(label="Temperature", minimum=0, maximum=1, value=1.0, step=0.05)

-         generate_button = gr.Button("Generate Images")
-         output_gallery = gr.Gallery(label="Generated Images", columns=2, height=300)
-
-         generate_button.click(
-             generate_image,
-             inputs=[prompt_input, seed_input, guidance_slider, temperature_slider],
-             outputs=output_gallery
          )

-         # Footer
          gr.Markdown("""
-         <hr>
-         <p style="text-align: center; font-size: 0.9em;">
-         Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
-         </p>
          """)
-
          # Visitor Badge
          gr.HTML("""
-         <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FDeepseekJanusPro%2F"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FDeepseekJanusPro%2F&countColor=%23263759" /></a>
          """)

      return demo

- demo = create_interface()
- demo.launch(share=True)

  from janus.models import MultiModalityCausalLM, VLChatProcessor
  from PIL import Image
  import numpy as np
+ import spaces
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Constants
+ DEFAULT_WIDTH = 384
+ DEFAULT_HEIGHT = 384
+ PARALLEL_SIZE = 5
+ PATCH_SIZE = 16
+
+ # Load model and processor with error handling
+ def load_model():
+     try:
+         model_path = "deepseek-ai/Janus-Pro-7B"
+         config = AutoConfig.from_pretrained(model_path)
+         language_config = config.language_config
+         language_config._attn_implementation = 'eager'
+
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         logger.info(f"Loading model on device: {device}")
+
+         vl_gpt = AutoModelForCausalLM.from_pretrained(
+             model_path,
+             language_config=language_config,
+             trust_remote_code=True,
+             torch_dtype=torch.bfloat16 if device.type == "cuda" else torch.float32
+         ).to(device)
+
+         vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+         return vl_gpt, vl_chat_processor, device
+
+     except Exception as e:
+         logger.error(f"Model loading failed: {str(e)}")
+         raise RuntimeError("Failed to load model. Please check the model path and dependencies.")
+
+ try:
+     vl_gpt, vl_chat_processor, device = load_model()
+     tokenizer = vl_chat_processor.tokenizer
+ except RuntimeError as e:
+     raise e
+
+ # Helper functions with improved memory management
+ def generate(input_ids, width, height, cfg_weight=5, temperature=1.0, parallel_size=5, progress=None):
+     try:
+         torch.cuda.empty_cache()
+         tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int, device=device)
+
+         for i in range(parallel_size * 2):
+             tokens[i, :] = input_ids
+             if i % 2 != 0:
+                 tokens[i, 1:-1] = vl_chat_processor.pad_id

          with torch.no_grad():
+             inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
+             generated_tokens = torch.zeros((parallel_size, 576), dtype=torch.int, device=device)
+
+             pkv = None
+             for i in range(576):
+                 if progress:
+                     progress((i + 1) / 576, desc="Generating image tokens")
+
+                 outputs = vl_gpt.language_model.model(
+                     inputs_embeds=inputs_embeds,
+                     use_cache=True,
+                     past_key_values=pkv
+                 )
+                 pkv = outputs.past_key_values
+                 hidden_states = outputs.last_hidden_state
+                 logits = vl_gpt.gen_head(hidden_states[:, -1, :])
+
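+                 # Classifier-free guidance: even batch rows hold the
+                 # conditional (prompted) logits, odd rows the unconditional
+                 # (pad-masked) ones; cfg_weight sets how strongly the prompt
+                 # steers sampling.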
+                 logit_cond = logits[0::2, :]
+                 logit_uncond = logits[1::2, :]
+                 logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
+
+                 probs = torch.softmax(logits / temperature, dim=-1)
+                 next_token = torch.multinomial(probs, num_samples=1)
+                 generated_tokens[:, i] = next_token.squeeze(dim=-1)
+
+                 next_token = torch.cat([next_token.unsqueeze(dim=1)] * 2, dim=1).view(-1)
+                 img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
+                 inputs_embeds = img_embeds.unsqueeze(dim=1)
+
+         return generated_tokens
+
+     except RuntimeError as e:
+         logger.error(f"Generation error: {str(e)}")
+         raise RuntimeError("Generation failed due to memory constraints. Try reducing the parallel size.")
+     finally:
+         torch.cuda.empty_cache()

  def unpack(patches, width, height, parallel_size=5):
+     try:
+         patches = patches.detach().to(device='cpu', dtype=torch.float32).numpy()
+         patches = patches.transpose(0, 2, 3, 1)
+         patches = np.clip((patches + 1) / 2 * 255, 0, 255)
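+         # Decoder output lies in [-1, 1]; rescale to [0, 255] for 8-bit RGB.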
+         return [Image.fromarray(patch.astype(np.uint8)) for patch in patches]
+     except Exception as e:
+         logger.error(f"Unpacking error: {str(e)}")
+         raise RuntimeError("Failed to process generated image data.")

  @torch.inference_mode()
  @spaces.GPU(duration=120)
+ def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0, progress=gr.Progress()):
+     try:
+         if not prompt.strip():
+             raise gr.Error("Please enter a valid prompt.")
+
+         progress(0, desc="Initializing...")
+         torch.cuda.empty_cache()
+
+         # Seed management
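+         # torch.seed() seeds the RNG non-deterministically and returns the value used.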
+         if seed is None:
+             seed = torch.seed()
+         else:
+             seed = int(seed)
+
          torch.manual_seed(seed)
+         if device.type == "cuda":
+             torch.cuda.manual_seed(seed)
+
+         messages = [{'role': '<|User|>', 'content': prompt}, {'role': '<|Assistant|>', 'content': ''}]
+         text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
+             conversations=messages,
+             sft_format=vl_chat_processor.sft_format,
+             system_prompt=''
+         ) + vl_chat_processor.image_start_tag
+
+         input_ids = torch.tensor(tokenizer.encode(text), dtype=torch.long, device=device)
+         progress(0.1, desc="Generating image tokens...")
+
+         generated_tokens = generate(
+             input_ids,
+             DEFAULT_WIDTH,
+             DEFAULT_HEIGHT,
+             cfg_weight=guidance,
+             temperature=t2i_temperature,
+             parallel_size=PARALLEL_SIZE,
+             progress=progress
+         )
+
+         progress(0.9, desc="Processing images...")
+         patches = vl_gpt.gen_vision_model.decode_code(
+             generated_tokens.to(dtype=torch.int),
+             shape=[PARALLEL_SIZE, 8, DEFAULT_WIDTH // PATCH_SIZE, DEFAULT_HEIGHT // PATCH_SIZE]
+         )
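+         # 576 tokens per image = (384 // 16)^2, i.e. a 24 x 24 latent grid
+         # that the vision decoder upsamples back to 384 x 384 pixels.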
+
+         images = unpack(patches, DEFAULT_WIDTH, DEFAULT_HEIGHT, PARALLEL_SIZE)
+         return images
+
+     except Exception as e:
+         logger.error(f"Generation failed: {str(e)}")
+         raise gr.Error(f"Image generation failed: {str(e)}")

  def create_interface():
+     with gr.Blocks(title="Janus-Pro-7B Image Generator", theme=gr.themes.Soft()) as demo:
          gr.Markdown("""
          # Text-to-Image Generation with Janus-Pro-7B
+         **Generate high-quality images from text prompts using DeepSeek's advanced multimodal AI model.**
          """)

+         with gr.Row():
+             with gr.Column(scale=3):
+                 prompt_input = gr.Textbox(label="Prompt", placeholder="Describe the image you want to generate...", lines=3)
+                 generate_btn = gr.Button("Generate Images", variant="primary")
+
+                 with gr.Accordion("Advanced Settings", open=False):
+                     with gr.Group():
+                         seed_input = gr.Number(label="Seed", value=None, precision=0, info="Leave empty for random seed")
+                         guidance_slider = gr.Slider(3, 10, value=5, step=0.5,
+                                                     label="CFG Guidance Weight",
+                                                     info="Higher values = more prompt adherence, lower values = more creativity")
+                         temp_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.1,
+                                                 label="Temperature",
+                                                 info="Higher values = more randomness, lower values = more deterministic")
+
+             with gr.Column(scale=2):
+                 output_gallery = gr.Gallery(label="Generated Images", columns=2, height=600, preview=True)
+                 status = gr.Textbox(label="Status", interactive=False)
+
+         gr.Examples(
+             examples=[
+                 ["A futuristic cityscape at sunset with flying cars and holographic advertisements"],
+                 ["An astronaut riding a horse in photorealistic style"],
+                 ["A cute robotic cat sitting on a stack of ancient books, digital art"]
+             ],
+             inputs=prompt_input
          )

          gr.Markdown("""
+         ## Model Information
+         - **Model:** [Janus-Pro-7B](https://huggingface.co/deepseek-ai/Janus-Pro-7B)
+         - **Output Resolution:** 384x384 pixels
+         - **Parallel Generation:** 5 images per request
          """)
+
+         # Footer Section
+         gr.Markdown("""
+         <hr style="margin-top: 2em; margin-bottom: 1em;">
+         <div style="text-align: center; color: #666; font-size: 0.9em;">
+             Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #2563eb; text-decoration: none;">bilsimaging.com</a>
+         </div>
+         """)
+
          # Visitor Badge
          gr.HTML("""
+         <div style="text-align: center; margin-top: 1em;">
+             <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FDeepseekJanusPro%2F">
+                 <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FDeepseekJanusPro%2F&countColor=%23263759"
+                      alt="Visitor Badge"
+                      style="display: inline-block; margin: 0 auto;">
+             </a>
+         </div>
          """)

+         generate_btn.click(
+             generate_image,
+             inputs=[prompt_input, seed_input, guidance_slider, temp_slider],
+             outputs=output_gallery,
+             api_name="generate"
+         )
+
+         demo.load(
+             fn=lambda: f"Device Status: {'GPU ✅' if device.type == 'cuda' else 'CPU ⚠️'}",
+             outputs=status,
+             queue=False
+         )

      return demo

+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch(share=True)
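
The click handler above registers `api_name="generate"`, so the updated Space can also be driven programmatically. Below is a minimal sketch using `gradio_client`, assuming the Space id `Bils/DeepseekJanusPro` (taken from the badge URL), that the Space is running and public, and that the endpoint takes the four inputs wired to the button:

```python
from gradio_client import Client

# Space id and endpoint name are assumptions based on the badge URL and the
# api_name="generate" registered in this commit.
client = Client("Bils/DeepseekJanusPro")
result = client.predict(
    "A cute robotic cat sitting on a stack of ancient books, digital art",  # prompt
    12345,  # seed
    5,      # CFG guidance weight
    1.0,    # temperature
    api_name="/generate",
)
print(result)  # gallery output, typically a list of generated image files
```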