Spaces: Running on Zero

Improved user interface

- app.py: +40 -13
- bounded_attention.py: +39 -3
app.py
CHANGED
@@ -20,6 +20,7 @@ WHITE = 255
 COLORS = ["red", "blue", "green", "orange", "purple", "turquoise", "olive"]
 
 PROMPT1 = "a ginger kitten and a gray puppy in a yard"
+SUBJECT_SUB_PROMPTS1 = "ginger kitten;gray puppy"
 SUBJECT_TOKEN_INDICES1 = "2,3;6,7"
 FILTER_TOKEN_INDICES1 = "1,4,5,8,9"
 NUM_TOKENS1 = "10"
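For context, the token-index strings are 1-based positions in the CLIP tokenization of the prompt (position 0 is the start-of-text token), so "2,3;6,7" selects "ginger kitten" and "gray puppy" in PROMPT1. A minimal sketch of how to verify this, assuming the standard Stable Diffusion text tokenizer (the checkpoint name is an assumption, not taken from this Space):

    from transformers import CLIPTokenizer

    # Assumed checkpoint; any CLIP BPE tokenizer behaves the same on this prompt.
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    ids = tokenizer.encode("a ginger kitten and a gray puppy in a yard")
    tokens = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
    print(tokens)
    # ['a</w>', 'ginger</w>', 'kitten</w>', 'and</w>', 'a</w>',
    #  'gray</w>', 'puppy</w>', 'in</w>', 'a</w>', 'yard</w>']
    # 1-based: "ginger kitten" -> 2,3 and "gray puppy" -> 6,7, matching "2,3;6,7";
    # the 10 word tokens match NUM_TOKENS1 = "10".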
@@ -158,6 +159,7 @@ FOOTNOTE = """
 def inference(
     boxes,
     prompts,
+    subject_sub_prompts,
     subject_token_indices,
     filter_token_indices,
     num_tokens,
@@ -190,9 +192,10 @@ def inference(
     editor = BoundedAttention(
         boxes,
         prompts,
-        subject_token_indices,
         list(range(70, 82)),
         list(range(70, 82)),
+        subject_sub_prompts=subject_sub_prompts,
+        subject_token_indices=subject_token_indices,
         filter_token_indices=filter_token_indices,
         eos_token_index=eos_token_index,
         cross_loss_coef=cross_loss_scale,
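With this change the subject specification becomes keyword-only and optional on both paths: callers can pass explicit subject_token_indices, subject_sub_prompts, or both. A usage sketch (argument values are illustrative only):

    # Explicit 1-based token indices per subject...
    editor = BoundedAttention(boxes, prompts, list(range(70, 82)), list(range(70, 82)),
                              subject_token_indices=[[2, 3], [6, 7]])

    # ...or sub-prompts, from which the editor derives the indices itself.
    editor = BoundedAttention(boxes, prompts, list(range(70, 82)), list(range(70, 82)),
                              subject_sub_prompts=["ginger kitten", "gray puppy"])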
@@ -214,6 +217,7 @@ def inference(
 @spaces.GPU(duration=340)
 def generate(
     prompt,
+    subject_sub_prompts,
     subject_token_indices,
     filter_token_indices,
     num_tokens,
@@ -231,27 +235,45 @@ def generate(
     seed,
     boxes,
 ):
-
+    num_subjects = 0
+    subject_sub_prompts = convert_sub_prompts(subject_sub_prompts)
     subject_token_indices = convert_token_indices(subject_token_indices, nested=True)
-    if len(boxes) != len(subject_token_indices):
+    if subject_sub_prompts is not None:
+        num_subjects = len(subject_sub_prompts)
+    if subject_token_indices is not None:
+        num_subjects = len(subject_token_indices)
+
+    if len(boxes) != num_subjects:
         raise gr.Error("""
             The number of boxes should be equal to the number of subjects.
             Number of boxes drawn: {}, number of subjects: {}.
-            """.format(len(boxes), len(subject_token_indices)))
+            """.format(len(boxes), num_subjects))
 
     filter_token_indices = convert_token_indices(filter_token_indices) if len(filter_token_indices.strip()) > 0 else None
     num_tokens = int(num_tokens) if len(num_tokens.strip()) > 0 else None
     prompts = [prompt.strip(".").strip(",").strip()] * batch_size
 
     images = inference(
-        boxes, prompts, subject_token_indices, filter_token_indices, num_tokens, init_step_size,
+        boxes, prompts, subject_sub_prompts, subject_token_indices, filter_token_indices, num_tokens, init_step_size,
         final_step_size, first_refinement_step, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
         classifier_free_guidance_scale, num_iterations, loss_threshold, num_guidance_steps, seed)
 
     return images
 
 
+def convert_sub_prompts(sub_prompts):
+    sub_prompts = sub_prompts.strip()
+    if len(sub_prompts) == 0:
+        return None
+
+    return [sub_prompt.strip() for sub_prompt in sub_prompts.split(";")]
+
+
 def convert_token_indices(token_indices, nested=False):
+    token_indices = token_indices.strip()
+    if len(token_indices) == 0:
+        return None
+
     if nested:
         return [convert_token_indices(indices, nested=False) for indices in token_indices.split(";")]
 
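Both converters treat empty input as "unspecified" and return None, which is what makes each textbox optional; semicolons separate subjects, commas separate indices within a subject. Expected behavior, sketched from the code above:

    convert_sub_prompts("ginger kitten;gray puppy")  # -> ["ginger kitten", "gray puppy"]
    convert_sub_prompts("   ")                       # -> None

    convert_token_indices("2,3;6,7", nested=True)    # -> [[2, 3], [6, 7]]
    convert_token_indices("", nested=True)           # -> None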
@@ -331,8 +353,13 @@ def main():
             placeholder=PROMPT1,
         )
 
+        subject_sub_prompts = gr.Textbox(
+            label="Sub-prompts for each subject (separate with semicolons)",
+            placeholder=SUBJECT_SUB_PROMPTS1,
+        )
+
         subject_token_indices = gr.Textbox(
-            label="The token indices of each subject (separate indices for the same subject with commas, and for different subjects with semicolons)",
+            label="Optional: The token indices of each subject (separate indices for the same subject with commas, and for different subjects with semicolons)",
             placeholder=SUBJECT_TOKEN_INDICES1,
         )
 
@@ -393,7 +420,7 @@ def main():
        generate_image_button.click(
            fn=generate,
            inputs=[
-                prompt, subject_token_indices, filter_token_indices, num_tokens,
+                prompt, subject_sub_prompts, subject_token_indices, filter_token_indices, num_tokens,
                init_step_size, final_step_size, first_refinement_step, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
                classifier_free_guidance_scale, batch_size, num_iterations, loss_threshold, num_guidance_steps,
                seed,
@@ -407,31 +434,31 @@ def main():
        gr.Examples(
            examples=[
                [
-                    PROMPT1, SUBJECT_TOKEN_INDICES1, FILTER_TOKEN_INDICES1, NUM_TOKENS1,
+                    PROMPT1, SUBJECT_SUB_PROMPTS1, SUBJECT_TOKEN_INDICES1, FILTER_TOKEN_INDICES1, NUM_TOKENS1,
                    15, 10, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    12,
                ],
                [
-                    PROMPT2, "7,8,17;11,12,17;15,16,17", "5,6,9,10,13,14,18,19", "21",
+                    PROMPT2, "cute unicorn;pink hedgehog;nerdy owl", "7,8,17;11,12,17;15,16,17", "5,6,9,10,13,14,18,19", "21",
                    25, 18, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    286,
                ],
                [
-                    PROMPT3, "7;10;13,14;17", "5,6,8,9,11,12,15,16", "17",
+                    PROMPT3, "astronaut;robot;green alien;spaceship", "7;10;13,14;17", "5,6,8,9,11,12,15,16", "17",
                    18, 12, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    216,
                ],
                [
-                    PROMPT4, "9,10;13,14;17", "1,4,5,7,8,11,12,15,16", "17",
+                    PROMPT4, "semi trailer;concrete mixer;helicopter", "9,10;13,14;17", "1,4,5,7,8,11,12,15,16", "17",
                    25, 18, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    82,
                ],
                [
-                    PROMPT5, "2,3;6,7;10,11;14,15;18,19", "1,4,5,8,9,12,13,16,17,20,21", "22",
+                    PROMPT5, "golden retriever;german shepherd;boston terrier;english bulldog;border collie", "2,3;6,7;10,11;14,15;18,19", "1,4,5,8,9,12,13,16,17,20,21", "22",
                    18, 12, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    152,
@@ -439,7 +466,7 @@ def main():
            ],
            fn=build_example_layout,
            inputs=[
-                prompt, subject_token_indices, filter_token_indices, num_tokens,
+                prompt, subject_sub_prompts, subject_token_indices, filter_token_indices, num_tokens,
                init_step_size, final_step_size, first_refinement_step, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
                classifier_free_guidance_scale, batch_size, num_iterations, loss_threshold, num_guidance_steps,
                seed,
bounded_attention.py
CHANGED
@@ -21,9 +21,10 @@ class BoundedAttention(injection_utils.AttentionBase):
         self,
         boxes,
         prompts,
-        subject_token_indices,
         cross_loss_layers,
         self_loss_layers,
+        subject_sub_prompts=None,
+        subject_token_indices=None,
         cross_mask_layers=None,
         self_mask_layers=None,
         eos_token_index=None,
@@ -56,6 +57,7 @@ class BoundedAttention(injection_utils.AttentionBase):
         super().__init__()
         self.boxes = boxes
         self.prompts = prompts
+        self.subject_sub_prompts = subject_sub_prompts
         self.subject_token_indices = subject_token_indices
         self.cross_loss_layers = set(cross_loss_layers)
         self.self_loss_layers = set(self_loss_layers)
@@ -186,8 +188,9 @@ class BoundedAttention(injection_utils.AttentionBase):
         self.optimized = False
         return latents
 
-    def _tokenize(self):
-        ids = self.model.tokenizer.encode(self.prompts[0])
+    def _tokenize(self, prompt=None):
+        prompt = self.prompts[0] if prompt is None else prompt
+        ids = self.model.tokenizer.encode(prompt)
         tokens = self.model.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
         return [token[:-4] for token in tokens]  # remove ending </w>
 
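Making _tokenize accept an arbitrary prompt lets the same helper tokenize both the full prompt and each sub-prompt; token[:-4] strips the '</w>' suffix that the CLIP BPE tokenizer appends to word-final tokens. A standalone sketch of the same behavior (hypothetical helper name, assuming a CLIP-style tokenizer):

    def tokenize_words(tokenizer, text):
        # Mirrors BoundedAttention._tokenize: drop special tokens, strip '</w>'.
        ids = tokenizer.encode(text)
        tokens = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
        return [token[:-4] for token in tokens]

    # tokenize_words(tokenizer, "gray puppy") -> ["gray", "puppy"]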
@@ -195,6 +198,38 @@ class BoundedAttention(injection_utils.AttentionBase):
         tagged_tokens = nltk.pos_tag(self._tokenize())
         return [type(self).TAG_RULES.get(token, tag) for token, tag in tagged_tokens]
 
+    def _determine_subject_tokens(self):
+        if self.subject_token_indices is not None:
+            return
+
+        if self.subject_sub_prompts is None:
+            raise ValueError('Missing subject sub-prompts.')
+
+        tokens = self._tokenize()
+
+        matches = []
+        self.subject_token_indices = []
+        for sub_prompt in self.subject_sub_prompts:
+            token_indices = self._determine_specific_subject_tokens(tokens, sub_prompt, matches)
+            matches.append(token_indices[0])
+            self.subject_token_indices.append(token_indices)
+
+    def _determine_specific_subject_tokens(self, tokens, sub_prompt, previous_matches):
+        sub_tokens = self._tokenize(sub_prompt)
+        sub_len = len(sub_tokens)
+
+        matches = []
+        for i in range(len(tokens)):
+            if tokens[i] == sub_tokens[0] and tokens[i:i + sub_len] == sub_tokens:
+                matches.append(i + 1)
+
+        if len(matches) == 0:
+            raise ValueError(f'Couldn\'t locate sub-prompt: {sub_prompt}.')
+
+        new_matches = [i for i in matches if i not in previous_matches]
+        last_match = new_matches[0] if len(new_matches) > 0 else matches[-1]
+        return list(range(last_match, last_match + sub_len))
+
     def _determine_eos_token(self):
         tokens = self._tokenize()
         eos_token_index = len(tokens) + 1
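The new subject resolution is a substring search over word tokens: each sub-prompt is tokenized, all of its 1-based occurrences in the full prompt are collected (the +1 accounts for the start-of-text token at position 0), and start positions already claimed by earlier subjects are skipped, so repeated subjects bind to distinct occurrences. A self-contained sketch of the same logic on plain word lists (hypothetical function name):

    def find_subject_indices(tokens, sub_tokens, previous_matches):
        # All 1-based start positions where sub_tokens occurs in tokens.
        sub_len = len(sub_tokens)
        matches = [i + 1 for i in range(len(tokens))
                   if tokens[i:i + sub_len] == sub_tokens]
        if not matches:
            raise ValueError(f"Couldn't locate sub-prompt: {' '.join(sub_tokens)}.")

        # Prefer the first occurrence not claimed by an earlier subject;
        # otherwise fall back to the last occurrence.
        new_matches = [i for i in matches if i not in previous_matches]
        start = new_matches[0] if new_matches else matches[-1]
        return list(range(start, start + sub_len))

    words = "a ginger kitten and a gray puppy in a yard".split()
    print(find_subject_indices(words, ["ginger", "kitten"], []))  # [2, 3]
    print(find_subject_indices(words, ["gray", "puppy"], [2]))    # [6, 7]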
@@ -224,6 +259,7 @@ class BoundedAttention(injection_utils.AttentionBase):
         self.leading_token_indices = leading_token_indices
 
     def _determine_tokens(self):
+        self._determine_subject_tokens()
         self._determine_eos_token()
         self._determine_filter_tokens()
         self._determine_leading_tokens()