sagar007 committed
Commit 44fe76b · verified · 1 Parent(s): bc1fb47

Update app.py

Files changed (1):
  1. app.py +75 -201
app.py CHANGED
@@ -1,29 +1,25 @@
- import gradio as gr
- import PIL
+ import os
  import torch
- import numpy as np
- from PIL import Image
+ import gradio as gr
  from tqdm import tqdm
+ from PIL import Image
  import torch.nn.functional as F
- import torchvision.transforms as T
- from diffusers import LMSDiscreteScheduler, DiffusionPipeline
-
- # configurations
- torch_device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
- height, width = 512,512
- guidance_scale = 8
- loss_scale = 200
- num_inference_steps = 50
 
+ torch_device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
+ if "mps" == torch_device: os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = "1"
 
+ # Load the pipeline
  model_path = "CompVis/stable-diffusion-v1-4"
  sd_pipeline = DiffusionPipeline.from_pretrained(
      model_path,
-     low_cpu_mem_usage = True,
+     low_cpu_mem_usage=True,
      torch_dtype=torch.float32
  ).to(torch_device)
 
-
+ # Load textual inversions
  sd_pipeline.load_textual_inversion("sd-concepts-library/illustration-style")
  sd_pipeline.load_textual_inversion("sd-concepts-library/line-art")
  sd_pipeline.load_textual_inversion("sd-concepts-library/hitokomoru-style-nao")
@@ -32,199 +28,77 @@ sd_pipeline.load_textual_inversion("sd-concepts-library/midjourney-style")
  sd_pipeline.load_textual_inversion("sd-concepts-library/hanfu-anime-style")
  sd_pipeline.load_textual_inversion("sd-concepts-library/birb-style")
 
-
- styles_mapping = {
-     "Illustration Style": '<illustration-style>', "Line Art":'<line-art>',
-     "Hitokomoru Style":'<hitokomoru-style-nao>', "Marc Allante": '<Marc_Allante>',
-     "Midjourney":'<midjourney-style>', "Hanfu Anime": '<hanfu-anime-style>',
+ # Update style token dictionary
+ style_token_dict = {
+     "Illustration Style": '<illustration-style>',
+     "Line Art":'<line-art>',
+     "Hitokomoru Style":'<hitokomoru-style-nao>',
+     "Marc Allante": '<Marc_Allante>',
+     "Midjourney":'<midjourney-style>',
+     "Hanfu Anime": '<hanfu-anime-style>',
      "Birb Style": '<birb-style>'
  }
 
- # Define seeds for all the styles
- seed_list = [11, 56, 110, 65, 5, 29, 47]
-
- # Loss Function based on Edge Detection
- def edge_detection(image):
-     channels = image.shape[1]
-
-     # Define the kernels for Edge Detection
-     ed_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=torch.float32).unsqueeze(0).unsqueeze(0)
-     ed_y = torch.tensor([[1, 2, 1], [0, 0, 0], [-1, -2, -1]], dtype=torch.float32).unsqueeze(0).unsqueeze(0)
-
-     # Replicate the Edge detection kernels for each channel
-     ed_x = ed_x.repeat(channels, 1, 1, 1).to(image.device)
-     ed_y = ed_y.repeat(channels, 1, 1, 1).to(image.device)
-
-     # ed_x = ed_x.to(torch.float16)
-     # ed_y = ed_y.to(torch.float16)
-
-     # Convolve the image with the Edge detection kernels
-     conv_ed_x = F.conv2d(image, ed_x, padding=1, groups=channels)
-     conv_ed_y = F.conv2d(image, ed_y, padding=1, groups=channels)
-
-     # Combine the x and y gradients after convolution
-     ed_value = torch.sqrt(conv_ed_x**2 + conv_ed_y**2)
-
-     return ed_value
-
- def edge_loss(image):
-     ed_value = edge_detection(image)
-     ed_capped = (ed_value > 0.5).to(torch.float32)
-     return F.mse_loss(ed_value, ed_capped)
-
- def compute_loss(original_image, loss_type):
-
-     if loss_type == 'blue':
-         # blue loss
-         # [:,2] -> all images in batch, only the blue channel
-         error = torch.abs(original_image[:,2] - 0.9).mean()
-     elif loss_type == 'edge':
-         # edge loss
-         error = edge_loss(original_image)
-     elif loss_type == 'contrast':
-         # RGB to Gray loss
-         transformed_image = T.functional.adjust_contrast(original_image, contrast_factor = 2)
-         error = torch.abs(transformed_image - original_image).mean()
-     elif loss_type == 'brightness':
-         # brightnesss loss
-         transformed_image = T.functional.adjust_brightness(original_image, brightness_factor = 2)
-         error = torch.abs(transformed_image - original_image).mean()
-     elif loss_type == 'sharpness':
-         # sharpness loss
-         transformed_image = T.functional.adjust_sharpness(original_image, sharpness_factor = 2)
-         error = torch.abs(transformed_image - original_image).mean()
-     elif loss_type == 'saturation':
-         # saturation loss
-         transformed_image = T.functional.adjust_saturation(original_image, saturation_factor = 10)
-         error = torch.abs(transformed_image - original_image).mean()
-     else:
-         print("error. Loss not defined")
-
-     return error
-
- def get_examples():
-     examples = [
-         ['A bird sitting on a tree', 'Midjourney', 'edge']
-     ]
-     return examples
-
- # Existing functions (latents_to_pil, show_image, generate_image)
- # ... (Copy all the existing functions here)
- def latents_to_pil(latents):
-     # bath of latents -> list of images
-     latents = (1 / 0.18215) * latents
-     with torch.no_grad():
-         image = sd_pipeline.vae.decode(latents).sample
-     image = (image / 2 + 0.5).clamp(0, 1) # 0 to 1
-     image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
-     image = (image * 255).round().astype("uint8")
-     return Image.fromarray(image[0])
-
-
- def show_image(prompt, concept, guidance_type):
-
-     for idx, sd in enumerate(styles_mapping.keys()):
-         if(sd == concept):
-             break
-     seed = seed_list[idx]
-     prompt = f"{prompt} in the style of {styles_mapping[sd]}"
-     styled_image_without_loss = latents_to_pil(generate_image(seed, prompt, guidance_type, loss_flag=False))
-     styled_image_with_loss = latents_to_pil(generate_image(seed, prompt, guidance_type, loss_flag=True))
-     return([styled_image_without_loss, styled_image_with_loss])
-
 
- def generate_image(seed, prompt, loss_type, loss_flag=False):
-
-     generator = torch.manual_seed(seed)
-     batch_size = 1
-
-     # scheduler
-     scheduler = LMSDiscreteScheduler(beta_start = 0.00085, beta_end = 0.012, beta_schedule = "scaled_linear", num_train_timesteps = 1000)
+ def set_timesteps(scheduler, num_inference_steps):
      scheduler.set_timesteps(num_inference_steps)
      scheduler.timesteps = scheduler.timesteps.to(torch.float32)
 
-     # text embeddings of the prompt
-     text_input = sd_pipeline.tokenizer(prompt, padding='max_length', max_length = sd_pipeline.tokenizer.model_max_length, truncation= True, return_tensors="pt")
-     input_ids = text_input.input_ids.to(torch_device)
-
+ def pil_to_latent(input_im):
      with torch.no_grad():
-         text_embeddings = sd_pipeline.text_encoder(text_input.input_ids.to(torch_device))[0]
-
-     max_length = text_input.input_ids.shape[-1]
-     uncond_input = sd_pipeline.tokenizer(
-         [""] * batch_size, padding="max_length", max_length= max_length, return_tensors="pt"
-     )
+         latent = vae.encode(tfms.ToTensor()(input_im).unsqueeze(0).to(torch_device)*2-1) # Note scaling
+     return 0.18215 * latent.latent_dist.sample()
 
+ def latents_to_pil(latents):
+     latents = (1 / 0.18215) * latents
      with torch.no_grad():
-         uncond_embeddings = sd_pipeline.text_encoder(uncond_input.input_ids.to(torch_device))[0]
-
-     text_embeddings = torch.cat([uncond_embeddings,text_embeddings]) # shape: 2,77,768
-
-     # random latent
-     latents = torch.randn(
-         (batch_size, sd_pipeline.unet.config.in_channels, height// 8, width //8),
-         generator = generator,
-     ) .to(torch.float32)
-
-
-     latents = latents.to(torch_device)
-     latents = latents * scheduler.init_noise_sigma
-
-     for i, t in tqdm(enumerate(scheduler.timesteps), total = len(scheduler.timesteps)):
-
-         latent_model_input = torch.cat([latents] * 2)
-         sigma = scheduler.sigmas[i]
-         latent_model_input = scheduler.scale_model_input(latent_model_input, t)
-
-         with torch.no_grad():
-             noise_pred = sd_pipeline.unet(latent_model_input.to(torch.float32), t, encoder_hidden_states=text_embeddings)["sample"]
-
-         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-         if loss_flag and i%5 == 0:
-
-             latents = latents.detach().requires_grad_()
-             # the following line alone does not work, it requires change to reduce step only once
-             # hence commenting it out
-             #latents_x0 = scheduler.step(noise_pred,t, latents).pred_original_sample
-             latents_x0 = latents - sigma * noise_pred
-
-             # use vae to decode the image
-             denoised_images = sd_pipeline.vae.decode((1/ 0.18215) * latents_x0).sample / 2 + 0.5 # range(0,1)
-
-             loss = compute_loss(denoised_images, loss_type) * loss_scale
-             #loss = loss.to(torch.float16)
-             print(f"{i} loss {loss}")
-
-             cond_grad = torch.autograd.grad(loss, latents)[0]
-             latents = latents.detach() - cond_grad * sigma**2
-
-         latents = scheduler.step(noise_pred,t, latents).prev_sample
-
-     return latents
-
- # Gradio interface function
- def generate_images(prompt, style, guidance_type):
-     images = show_image(prompt, style, guidance_type)
-     return images[0], images[1]
-
- # Create Gradio interface
- iface = gr.Interface(
-     fn=generate_images,
-     inputs=[
-         gr.Textbox(label="Prompt"),
-         gr.Dropdown(list(styles_mapping.keys()), label="Style"),
-         gr.Dropdown(["blue", "edge", "contrast", "brightness", "sharpness", "saturation"], label="Guidance Type"),
-     ],
-     outputs=[
-         gr.Image(label="Image without Loss"),
-         gr.Image(label="Image with Loss"),
-     ],
-     examples=get_examples(),
-     title="Text Inversion Image Generation",
-     description="Generate images using text inversion with different styles and guidance types.",
- )
-
- # Launch the app
- iface.launch()
+         image = vae.decode(latents).sample
+     image = (image / 2 + 0.5).clamp(0, 1)
+     image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+     images = (image * 255).round().astype("uint8")
+     pil_images = [Image.fromarray(image) for image in images]
+     return pil_images
+
+ def generate_with_pipeline(prompt, num_inference_steps, guidance_scale, seed):
+     generator = torch.Generator(device=torch_device).manual_seed(seed)
+     image = sd_pipeline(
+         prompt,
+         num_inference_steps=num_inference_steps,
+         guidance_scale=guidance_scale,
+         generator=generator
+     ).images[0]
+     return image
+
+ def inference(text, style, inference_step, guidance_scale, seed, guidance_method, loss_scale):
+     prompt = text + " " + style_token_dict[style]
+
+     # Generate image with pipeline
+     image_pipeline = generate_with_pipeline(prompt, inference_step, guidance_scale, seed)
+
+     # For the guided image, we'll need to implement a custom pipeline or modify the existing one
+     # This is a placeholder and would need to be implemented
+     image_guide = image_pipeline # This should be replaced with actual guided generation
+
+     return image_pipeline, image_guide
+
+ title = "Stable Diffusion with Textual Inversion"
+ description = "A simple Gradio interface to infer Stable Diffusion and generate images with different art styles"
+ examples = [["A sweet potato farm", 'Illustration Style', 10, 4.5, 1, 'Grayscale', 100],
+             ["Sky full of cotton candy", 'Line Art', 10, 9.5, 2, 'Bright', 200]]
+
+ demo = gr.Interface(inference,
+                     inputs = [gr.Textbox(label="Prompt", type="text"),
+                               gr.Dropdown(label="Style", choices=list(style_token_dict.keys()), value="Illustration Style"),
+                               gr.Slider(10, 30, 10, step = 1, label="Inference steps"),
+                               gr.Slider(1, 10, 7.5, step = 0.1, label="Guidance scale"),
+                               gr.Slider(0, 10000, 1, step = 1, label="Seed"),
+                               gr.Dropdown(label="Guidance method", choices=['Grayscale', 'Bright', 'Contrast',
+                                           'Symmetry', 'Saturation'], value="Grayscale"),
+                               gr.Slider(100, 10000, 200, step = 100, label="Loss scale")],
+                     outputs= [gr.Image(width=320, height=320, label="Generated art"),
+                               gr.Image(width=320, height=320, label="Generated art with guidance")],
+                     title=title,
+                     description=description,
+                     examples=examples)
+
+ demo.launch()
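
Note: in the new inference function the guided image is an acknowledged placeholder (image_guide = image_pipeline). The loss-guidance loop removed by this commit could be adapted to fill that gap. The sketch below is one possible adaptation, not part of the commit: it assumes the globals defined in app.py (sd_pipeline, torch_device), and the per-method losses for the new Guidance-method choices ('Grayscale', 'Bright', 'Contrast', 'Symmetry', 'Saturation') are illustrative guesses, since the committed code does not define them.

# Sketch only: loss-guided generation adapted from the removed generate_image().
# Assumes sd_pipeline and torch_device from app.py; guidance_loss is a guess.
import torch
import torch.nn.functional as F
from PIL import Image
from tqdm import tqdm
from diffusers import LMSDiscreteScheduler

def guidance_loss(images, method):
    # images: (B, 3, H, W), values roughly in [0, 1]
    if method == 'Grayscale':    # pull every channel toward the per-pixel mean
        gray = images.mean(dim=1, keepdim=True)
        return F.mse_loss(images, gray.expand_as(images))
    if method == 'Bright':       # push pixel values toward a bright target
        return torch.abs(images - 0.9).mean()
    if method == 'Contrast':     # reward larger spatial standard deviation
        return -images.std(dim=(2, 3)).mean()
    if method == 'Symmetry':     # penalise left/right mirror mismatch
        return F.mse_loss(images, torch.flip(images, dims=[3]))
    if method == 'Saturation':   # reward channel spread as a saturation proxy
        return -(images.max(dim=1).values - images.min(dim=1).values).mean()
    raise ValueError(f"Unknown guidance method: {method}")

def generate_guided(prompt, num_inference_steps, guidance_scale, seed, method, loss_scale):
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012,
                                     beta_schedule="scaled_linear",
                                     num_train_timesteps=1000)
    scheduler.set_timesteps(num_inference_steps)

    # Classifier-free guidance embeddings (unconditional + conditional)
    tok = sd_pipeline.tokenizer
    text_ids = tok(prompt, padding="max_length", max_length=tok.model_max_length,
                   truncation=True, return_tensors="pt").input_ids.to(torch_device)
    uncond_ids = tok("", padding="max_length", max_length=tok.model_max_length,
                     return_tensors="pt").input_ids.to(torch_device)
    with torch.no_grad():
        text_emb = sd_pipeline.text_encoder(text_ids)[0]
        uncond_emb = sd_pipeline.text_encoder(uncond_ids)[0]
    embeddings = torch.cat([uncond_emb, text_emb])

    # Start from seeded noise in latent space (512x512 image -> 64x64 latents)
    generator = torch.manual_seed(seed)
    latents = torch.randn((1, sd_pipeline.unet.config.in_channels, 64, 64),
                          generator=generator).to(torch_device)
    latents = latents * scheduler.init_noise_sigma

    for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
        latent_input = scheduler.scale_model_input(torch.cat([latents] * 2), t)
        with torch.no_grad():
            noise_pred = sd_pipeline.unet(latent_input, t,
                                          encoder_hidden_states=embeddings).sample
        uncond, cond = noise_pred.chunk(2)
        noise_pred = uncond + guidance_scale * (cond - uncond)

        if i % 5 == 0:  # every few steps, nudge the latents against the guidance loss
            latents = latents.detach().requires_grad_()
            sigma = scheduler.sigmas[i]
            pred_x0 = latents - sigma * noise_pred
            decoded = sd_pipeline.vae.decode((1 / 0.18215) * pred_x0).sample / 2 + 0.5
            loss = guidance_loss(decoded.clamp(0, 1), method) * loss_scale
            grad = torch.autograd.grad(loss, latents)[0]
            latents = latents.detach() - grad * sigma ** 2

        latents = scheduler.step(noise_pred, t, latents).prev_sample

    # Decode the final latents to a PIL image
    with torch.no_grad():
        image = sd_pipeline.vae.decode((1 / 0.18215) * latents).sample
    image = (image / 2 + 0.5).clamp(0, 1).cpu().permute(0, 2, 3, 1).numpy()
    return Image.fromarray((image[0] * 255).round().astype("uint8"))

Wiring this in would amount to replacing the placeholder line with something like image_guide = generate_guided(prompt, inference_step, guidance_scale, seed, guidance_method, loss_scale).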