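# Gradio demo: segment an uploaded image with Meta's Segment Anything Model (SAM),
# then use the resulting segmentation map as a ControlNet condition for Stable
# Diffusion to generate images from a text prompt.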
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
from PIL import Image
import gradio as gr
import numpy as np
import requests
import torch
import gc
device = "cuda" if torch.cuda.is_available() else "cpu"
# Download and Create SAM Model
print("[Downloading SAM Weights]")
SAM_URL = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
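# NOTE: the ViT-H checkpoint is a multi-gigabyte download that is held fully in
# memory before being written to disk below.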
r = requests.get(SAM_URL, allow_redirects=True)
print("[Writing SAM Weights]")
with open("./sam_vit_h_4b8939.pth", "wb") as sam_weights:
    sam_weights.write(r.content)
del r
gc.collect()
sam = sam_model_registry["vit_h"](checkpoint="./sam_vit_h_4b8939.pth").to(device)
mask_generator = SamAutomaticMaskGenerator(sam)
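# SamAutomaticMaskGenerator segments the whole image without prompts and returns a
# list of mask dicts (each with a boolean 'segmentation' array and its 'area'),
# which show_anns() below turns into a colour-coded map.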
gc.collect()
# Create ControlNet Pipeline
print("Creating ControlNet Pipeline")
controlnet = ControlNetModel.from_pretrained(
"mfidabel/controlnet-segment-anything", torch_dtype=torch.float16
).to(device)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, safety_check=None
).to(device)
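# The ControlNet checkpoint was trained to condition Stable Diffusion on SAM
# segmentation maps; the fp16 weights assume a CUDA device.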
# Description
title = "# 🧨 ControlNet on Segment Anything 🤗"
description = """This is a demo on 🧨 ControlNet based on Meta's [Segment Anything Model](https://segment-anything.com/).
Upload an Image, Segment it with Segment Anything, write a prompt, and generate images 🤗
⌛️ It takes about 20 seconds to generate 4 samples; for faster results, reduce the Nº Samples to 1.
You can obtain the Segmentation Map of any Image through this Colab: [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mfidabel/JAX_SPRINT_2023/blob/main/Segment_Anything_JAX_SPRINT.ipynb)
A huge thanks goes out to @GoogleCloud for providing the powerful TPUs that enabled us to train this model, and to the @HuggingFace Team for organizing the sprint.
Check out our [Model Card 🧨](https://huggingface.co/mfidabel/controlnet-segment-anything)
"""
about = """
# 👨💻 About the model
This [model](https://huggingface.co/mfidabel/controlnet-segment-anything) is based on the [ControlNet Model](https://huggingface.co/blog/controlnet), which allows us to generate images conditioned on an extra input image. For this model, we selected the segmentation maps produced by Meta's new segmentation model, the [Segment Anything Model](https://github.com/facebookresearch/segment-anything), as the conditioning image. We then trained the model to generate images that follow the structure of the segmentation maps and the given text prompts.
# 💾 About the dataset
For training, we generated a segmented dataset based on the [COYO-700M](https://huggingface.co/datasets/kakaobrain/coyo-700m) dataset, which provided the images and text prompts. The segmentation maps were produced with the [Segment Anything Model](https://github.com/facebookresearch/segment-anything). We created about 8k samples to train our model on, which isn't a lot, but as a team we were busy with many other responsibilities and time constraints, which made it challenging to dedicate more time to generating a larger dataset. Despite these constraints, we still managed to achieve some nice results 🙌
You can check the generated datasets below ⬇️
- [sam-coyo-2k](https://huggingface.co/datasets/mfidabel/sam-coyo-2k)
- [sam-coyo-2.5k](https://huggingface.co/datasets/mfidabel/sam-coyo-2.5k)
- [sam-coyo-3k](https://huggingface.co/datasets/mfidabel/sam-coyo-3k)
"""
gif_html = """ <img src="https://github.com/mfidabel/JAX_SPRINT_2023/blob/8632f0fde7388d7a4fc57225c96ef3b8411b3648/EX_1.gif?raw=true" alt= “” height="50%" class="about"> """
examples = [["photo of a futuristic dining table, high quality, tricolor", "low quality, deformed, blurry, points", "examples/condition_image_1.jpeg"],
["a monochrome photo of henry cavil using a shirt, high quality", "low quality, low res, deformed", "examples/condition_image_2.jpeg"],
["photo of a japanese living room, high quality, coherent", "low quality, colors, saturation, extreme brightness, blurry, low res", "examples/condition_image_3.jpeg"],
["living room, detailed, high quality", "low quality, low resolution, render, oversaturated, low contrast", "examples/condition_image_4.jpeg"],
["painting of the bodiam castle, Vicent Van Gogh style, Starry Night", "low quality, low resolution, render, oversaturated, low contrast", "examples/condition_image_5.jpeg"],
["painting of food, olive oil can, purple wine, green cabbage, chili peppers, pablo picasso style, high quality", "low quality, low resolution, render, oversaturated, low contrast, realistic", "examples/condition_image_6.jpeg"],
["Katsushika Hokusai painting of mountains, a sky and desert landscape, The Great Wave off Kanagawa style, colorful",
"low quality, low resolution, render, oversaturated, low contrast, realistic", "examples/condition_image_7.jpeg"]]
default_example = examples[4]
examples = examples[::-1]
css = "h1 { text-align: center } .about { text-align: justify; padding-left: 10%; padding-right: 10%; }"
# Helper and inference functions
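# show_anns(): paint each SAM mask in a random flat colour (largest masks first) and
# composite them into a single RGB segmentation map.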
def show_anns(anns):
    if len(anns) == 0:
        return
    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
    h, w = anns[0]['segmentation'].shape
    final_img = Image.fromarray(np.zeros((h, w, 3), dtype=np.uint8), mode="RGB")
    for ann in sorted_anns:
        m = ann['segmentation']
        img = np.empty((m.shape[0], m.shape[1], 3), dtype=np.uint8)
        for i in range(3):
            img[:,:,i] = np.random.randint(255, dtype=np.uint8)
        final_img.paste(Image.fromarray(img, mode="RGB"), (0, 0), Image.fromarray(np.uint8(m*255)))
    return final_img
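# segment_image(): run SAM's automatic mask generator on the uploaded image and
# convert the masks into a colour-coded segmentation map.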
def segment_image(image, seed=0):
    # Generate Masks
    np.random.seed(int(seed))
    masks = mask_generator.generate(image)
    torch.cuda.empty_cache()
    # Create map
    map = show_anns(masks)
    del masks
    gc.collect()
    torch.cuda.empty_cache()
    return map
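# infer(): first yields the segmentation map (with blank placeholders for the
# gallery), then yields the final ControlNet-generated images, so the UI updates
# in two steps.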
def infer(prompts, negative_prompts, image, num_inference_steps=50, seed=4, num_samples=4):
    # Slider values may arrive as floats; cast them before using them as counts/seeds
    num_samples = int(num_samples)
    seed = int(seed)
    try:
        # Segment Image
        print("Segmenting Everything")
        segmented_map = segment_image(image, seed)
        yield segmented_map, [Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))] * num_samples
        # Generate
        rng = torch.Generator(device="cpu").manual_seed(seed)
        num_inference_steps = int(num_inference_steps)
        print(f"Generating Prompt: {prompts} \nNegative Prompt: {negative_prompts} \nSamples:{num_samples}")
        output = pipe([prompts] * num_samples,
                      [segmented_map] * num_samples,
                      negative_prompt=[negative_prompts] * num_samples,
                      generator=rng,
                      num_inference_steps=num_inference_steps)
        final_image = output.images
        del output
    except Exception as e:
        print("Error: " + str(e))
        final_image = segmented_map = [np.zeros((512, 512, 3), dtype=np.uint8)] * num_samples
    finally:
        gc.collect()
        torch.cuda.empty_cache()
        yield segmented_map, final_image
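# UI components. Note: the Image `shape` argument and `.style()` are Gradio 3.x
# APIs that were removed in Gradio 4, so this app expects gradio<4.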
cond_img = gr.Image(label="Input", shape=(512, 512), value=default_example[2])\
            .style(height=400)
segm_img = gr.Image(label="Segmented Image", shape=(512, 512), interactive=False)\
            .style(height=400)
output = gr.Gallery(label="Generated images")\
          .style(height=200, rows=[2], columns=[2], object_fit="contain")
prompt = gr.Textbox(lines=1, label="Prompt", value=default_example[0])
negative_prompt = gr.Textbox(lines=1, label="Negative Prompt", value=default_example[1])
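# Demo layout: description and preview GIF, the three image panels, prompt controls
# with advanced options, examples, and the About section.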
with gr.Blocks(css=css) as demo:
    with gr.Row():
        with gr.Column():
            # Title
            gr.Markdown(title)
            # Description
            gr.Markdown(description)
        with gr.Column():
            # Examples
            gr.Markdown(gif_html)
    # Images
    with gr.Row(variant="panel"):
        with gr.Column(scale=1):
            cond_img.render()
        with gr.Column(scale=1):
            segm_img.render()
        with gr.Column(scale=1):
            output.render()
    # Submit & Clear
    with gr.Row():
        with gr.Column():
            prompt.render()
            negative_prompt.render()
        with gr.Column():
            with gr.Accordion("Advanced options", open=False):
                num_steps = gr.Slider(10, 60, 50, step=1, label="Steps")
                seed = gr.Slider(0, 1024, 4, step=1, label="Seed")
                num_samples = gr.Slider(1, 4, 4, step=1, label="Nº Samples")
            segment_btn = gr.Button("Segment")
            submit = gr.Button("Segment & Generate Images")
            # TODO: Download Button
    with gr.Row():
        with gr.Column():
            gr.Markdown("Try some of the examples below ⬇️")
            gr.Examples(examples=examples,
                        inputs=[prompt, negative_prompt, cond_img],
                        outputs=output,
                        fn=infer,
                        examples_per_page=4)
        with gr.Column():
            gr.Markdown(about, elem_classes="about")
    submit.click(infer,
                 inputs=[prompt, negative_prompt, cond_img, num_steps, seed, num_samples],
                 outputs=[segm_img, output])
    segment_btn.click(segment_image,
                      inputs=[cond_img, seed],
                      outputs=segm_img)
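# queue() is required because `infer` is a generator (it yields intermediate
# results); without queuing, Gradio cannot stream the partial outputs.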
demo.queue()
demo.launch()