################################################################################
# Copyright (C) 2023 Xingqian Xu - All Rights Reserved #
# #
# Please visit Versatile Diffusion's arXiv paper for more details, link at #
# arxiv.org/abs/2211.08332 #
# #
# Besides, this work is also inspired by many established techniques including:#
# Denoising Diffusion Probabilistic Model; Denoising Diffusion Implicit Model; #
# Latent Diffusion Model; Stable Diffusion; Stable Diffusion - Img2Img; Stable #
# Diffusion - Variation; ImageMixer; DreamBooth; Stable Diffusion - Lora; More #
# Control for Free; Prompt-to-Prompt; #
# #
################################################################################
import gradio as gr
import os
import PIL
from PIL import Image
from pathlib import Path
import numpy as np
import numpy.random as npr
from contextlib import nullcontext
import types
import torch
import torchvision.transforms as tvtrans
from lib.cfg_helper import model_cfg_bank
from lib.model_zoo import get_model
from cusomized_gradio_blocks import create_myexamples, customized_as_example, customized_postprocess
n_sample_image = 2
n_sample_text = 4
cache_examples = True
from lib.model_zoo.ddim import DDIMSampler
##########
# helper #
##########
def highlight_print(info):
print('')
print(''.join(['#']*(len(info)+4)))
print('# '+info+' #')
print(''.join(['#']*(len(info)+4)))
print('')
def decompose(x, q=20, niter=100):
x_mean = x.mean(-1, keepdim=True)
x_input = x - x_mean
u, s, v = torch.pca_lowrank(x_input, q=q, center=False, niter=niter)
ss = torch.stack([torch.diag(si) for si in s])
x_lowrank = torch.bmm(torch.bmm(u, ss), torch.permute(v, [0, 2, 1]))
x_remain = x_input - x_lowrank
return u, s, v, x_mean, x_remain
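# Hedged usage sketch (not part of the original app): `decompose` factors a batch
# of token features into a rank-q PCA approximation plus its mean and residual.
# The tensor shape and q below are illustrative only.
def _demo_decompose_roundtrip():
    x = torch.randn(2, 64, 32)
    u, s, v, x_mean, x_remain = decompose(x, q=8, niter=20)
    ss = torch.stack([torch.diag(si) for si in s])
    x_lowrank = torch.bmm(torch.bmm(u, ss), torch.permute(v, [0, 2, 1]))
    # low-rank part + mean + residual reproduces the input
    assert torch.allclose(x, x_lowrank + x_mean + x_remain, atol=1e-4)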
class adjust_rank(object):
def __init__(self, max_drop_rank=[1, 5], q=20):
self.max_semantic_drop_rank = max_drop_rank[0]
self.max_style_drop_rank = max_drop_rank[1]
self.q = q
def t2y0_semf_wrapper(t0, y00, t1, y01):
return lambda t: (np.exp((t-0.5)*2)-t0)/(t1-t0)*(y01-y00)+y00
t0, y00 = np.exp((0 -0.5)*2), -self.max_semantic_drop_rank
t1, y01 = np.exp((0.5-0.5)*2), 1
self.t2y0_semf = t2y0_semf_wrapper(t0, y00, t1, y01)
def x2y_semf_wrapper(x0, x1, y1):
return lambda x, y0: (x-x0)/(x1-x0)*(y1-y0)+y0
x0 = 0
x1, y1 = self.max_semantic_drop_rank+1, 1
self.x2y_semf = x2y_semf_wrapper(x0, x1, y1)
def t2y0_styf_wrapper(t0, y00, t1, y01):
return lambda t: (np.exp((t-0.5)*2)-t0)/(t1-t0)*(y01-y00)+y00
t0, y00 = np.exp((1 -0.5)*2), -(q-self.max_style_drop_rank)
t1, y01 = np.exp((0.5-0.5)*2), 1
self.t2y0_styf = t2y0_styf_wrapper(t0, y00, t1, y01)
def x2y_styf_wrapper(x0, x1, y1):
return lambda x, y0: (x-x0)/(x1-x0)*(y1-y0)+y0
x0 = q-1
x1, y1 = self.max_style_drop_rank-1, 1
self.x2y_styf = x2y_styf_wrapper(x0, x1, y1)
def __call__(self, x, lvl):
if lvl == 0.5:
return x
if x.dtype == torch.float16:
fp16 = True
x = x.float()
else:
fp16 = False
std_save = x.std(axis=[-2, -1])
u, s, v, x_mean, x_remain = decompose(x, q=self.q)
if lvl < 0.5:
assert lvl>=0
for xi in range(0, self.max_semantic_drop_rank+1):
y0 = self.t2y0_semf(lvl)
yi = self.x2y_semf(xi, y0)
yi = 0 if yi<0 else yi
s[:, xi] *= yi
elif lvl > 0.5:
assert lvl <= 1
for xi in range(self.max_style_drop_rank, self.q):
y0 = self.t2y0_styf(lvl)
yi = self.x2y_styf(xi, y0)
yi = 0 if yi<0 else yi
s[:, xi] *= yi
x_remain = 0
ss = torch.stack([torch.diag(si) for si in s])
x_lowrank = torch.bmm(torch.bmm(u, ss), torch.permute(v, [0, 2, 1]))
x_new = x_lowrank + x_mean + x_remain
std_new = x_new.std(axis=[-2, -1])
x_new = x_new / std_new * std_save
if fp16:
x_new = x_new.half()
return x_new
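# Hedged usage sketch (not part of the original app): adjust_rank rescales the
# principal components of a context tensor. lvl=0.5 is the identity; lvl below
# 0.5 attenuates the leading ("semantic drop") ranks, while lvl above 0.5
# attenuates the trailing ("style drop") ranks and discards the residual.
# The tensor shape below mirrors a CLIP context [batch, tokens, dim] and is
# illustrative only.
def _demo_adjust_rank():
    f = adjust_rank(max_drop_rank=[1, 5], q=20)
    c = torch.randn(1, 256, 768)
    c_identity = f(c, 0.5)       # returned unchanged
    c_drop_leading = f(c, 0.0)   # leading components suppressed
    c_drop_trailing = f(c, 1.0)  # trailing components suppressed
    return c_identity, c_drop_leading, c_drop_trailing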
def remove_duplicate_word(tx):
def combine_words(input, length):
combined_inputs = []
if len(splitted_input)>1:
for i in range(len(input)-1):
                combined_inputs.append(input[i]+" "+last_word_of(splitted_input[i+1], length))  # append the last word of the next (overlapping) sequence, i.e. the next word of the original sentence
return combined_inputs, length+1
def remove_duplicates(input, length):
        bool_broke = False  # no duplicate found yet
for i in range(len(input) - length):
if input[i]==input[i + length]: #found a duplicate piece of sentence!
for j in range(0, length): #remove the overlapping sequences in reverse order
del input[i + length - j]
bool_broke = True
                break  # stop: the list length no longer matches after elements were removed
if bool_broke:
return remove_duplicates(input, length) #if we found a duplicate, look for another duplicate of the same length
return input
def last_word_of(input, length):
splitted = input.split(" ")
if len(splitted)==0:
return input
else:
return splitted[length-1]
def split_and_puncsplit(text):
tx = text.split(" ")
txnew = []
for txi in tx:
txqueue=[]
while True:
if txi[0] in '([{':
txqueue.extend([txi[:1], '<puncnext>'])
txi = txi[1:]
if len(txi) == 0:
break
else:
break
txnew += txqueue
txstack=[]
if len(txi) == 0:
continue
while True:
if txi[-1] in '?!.,:;}])':
txstack = ['<puncnext>', txi[-1:]] + txstack
txi = txi[:-1]
if len(txi) == 0:
break
else:
break
if len(txi) != 0:
txnew += [txi]
txnew += txstack
return txnew
if tx == '':
return tx
splitted_input = split_and_puncsplit(tx)
word_length = 1
intermediate_output = False
while len(splitted_input)>1:
splitted_input = remove_duplicates(splitted_input, word_length)
if len(splitted_input)>1:
splitted_input, word_length = combine_words(splitted_input, word_length)
if intermediate_output:
print(splitted_input)
print(word_length)
output = splitted_input[0]
output = output.replace(' <puncnext> ', '')
return output
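# Hedged usage sketch (not part of the original app): the diffusion text decoder
# can emit repeated phrases; remove_duplicate_word collapses consecutive repeats
# of growing n-gram length until the sentence stabilizes. The input below is
# illustrative only.
def _demo_remove_duplicate_word():
    tx = "a photo of a cat a photo of a cat sitting on a sofa"
    # should print "a photo of a cat sitting on a sofa"
    print(remove_duplicate_word(tx))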
def get_instruction(mode):
t2i_instruction = ["Generate image from text prompt."]
i2i_instruction = ["Generate image conditioned on reference image.",]
i2t_instruction = ["Generate text from reference image. "]
t2t_instruction = ["Generate text from reference text prompt. "]
dcg_instruction = ["Generate image conditioned on both text and image."]
tcg_instruction = ["Generate image conditioned on text and up to two images."]
mcg_instruction = ["Generate image from multiple contexts."]
if mode == "Text-to-Image":
return '\n'.join(t2i_instruction)
elif mode == "Image-Variation":
return '\n'.join(i2i_instruction)
elif mode == "Image-to-Text":
return '\n'.join(i2t_instruction)
elif mode == "Text-Variation":
return '\n'.join(t2t_instruction)
elif mode == "Dual-Context":
return '\n'.join(dcg_instruction)
elif mode == "Triple-Context":
return '\n'.join(tcg_instruction)
elif mode == "Multi-Context":
return '\n'.join(mcg_instruction)
else:
assert False
########
# main #
########
class vd_dummy(object):
def __init__(self, *args, **kwarg):
self.which = 'Vdummy'
def inference_t2i(self, *args, **kwarg): pass
def inference_i2i(self, *args, **kwarg): pass
def inference_i2t(self, *args, **kwarg): pass
def inference_t2t(self, *args, **kwarg): pass
def inference_dcg(self, *args, **kwarg): pass
def inference_tcg(self, *args, **kwarg): pass
def inference_mcg(self, *args, **kwarg):
return None, None
class vd_inference(object):
    def __init__(self, fp16=False, which='v1.0'):
highlight_print(which)
self.which = which
if self.which == 'v1.0':
cfgm = model_cfg_bank()('vd_four_flow_v1-0')
else:
assert False, 'Model type not supported'
net = get_model()(cfgm)
if fp16:
highlight_print('Running in FP16')
if self.which == 'v1.0':
net.ctx['text'].fp16 = True
net.ctx['image'].fp16 = True
net = net.half()
self.dtype = torch.float16
else:
self.dtype = torch.float32
if self.which == 'v1.0':
# if fp16:
# sd = torch.load('pretrained/vd-four-flow-v1-0-fp16.pth', map_location='cpu')
# else:
# sd = torch.load('pretrained/vd-four-flow-v1-0.pth', map_location='cpu')
from huggingface_hub import hf_hub_download
if fp16:
temppath = hf_hub_download('shi-labs/versatile-diffusion-model', 'pretrained_pth/vd-four-flow-v1-0-fp16.pth')
else:
temppath = hf_hub_download('shi-labs/versatile-diffusion-model', 'pretrained_pth/vd-four-flow-v1-0.pth')
sd = torch.load(temppath, map_location='cpu')
net.load_state_dict(sd, strict=False)
self.use_cuda = torch.cuda.is_available()
if self.use_cuda:
net.to('cuda')
self.net = net
self.sampler = DDIMSampler(net)
self.output_dim = [512, 512]
self.n_sample_image = n_sample_image
self.n_sample_text = n_sample_text
self.ddim_steps = 50
self.ddim_eta = 0.0
self.scale_textto = 7.5
self.image_latent_dim = 4
self.text_latent_dim = 768
self.text_temperature = 1
if which == 'v1.0':
self.adjust_rank_f = adjust_rank(max_drop_rank=[1, 5], q=20)
self.scale_imgto = 7.5
self.disentanglement_noglobal = True
def inference_t2i(self, text, seed):
n_samples = self.n_sample_image
scale = self.scale_textto
sampler = self.sampler
h, w = self.output_dim
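        # Classifier-free guidance: the empty string is encoded as the
        # unconditional context, the user prompt as the conditional context.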
u = self.net.ctx_encode([""], which='text').repeat(n_samples, 1, 1)
c = self.net.ctx_encode([text], which='text').repeat(n_samples, 1, 1)
shape = [n_samples, self.image_latent_dim, h//8, w//8]
np.random.seed(seed)
torch.manual_seed(seed + 100)
x, _ = sampler.sample(
steps=self.ddim_steps,
x_info={'type':'image'},
c_info={'type':'text', 'conditioning':c, 'unconditional_conditioning':u,
'unconditional_guidance_scale':scale},
shape=shape,
verbose=False,
eta=self.ddim_eta)
im = self.net.vae_decode(x, which='image')
im = [tvtrans.ToPILImage()(i) for i in im]
return im
def inference_i2i(self, im, fid_lvl, fcs_lvl, clr_adj, seed):
n_samples = self.n_sample_image
scale = self.scale_imgto
sampler = self.sampler
h, w = self.output_dim
device = self.net.device
BICUBIC = PIL.Image.Resampling.BICUBIC
im = im.resize([w, h], resample=BICUBIC)
if fid_lvl == 1:
return [im]*n_samples
cx = tvtrans.ToTensor()(im)[None].to(device).to(self.dtype)
c = self.net.ctx_encode(cx, which='image')
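        # Keep the global CLIP token untouched and apply the semantic/style
        # rank adjustment only to the local tokens.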
if self.disentanglement_noglobal:
c_glb = c[:, 0:1]
c_loc = c[:, 1: ]
c_loc = self.adjust_rank_f(c_loc, fcs_lvl)
c = torch.cat([c_glb, c_loc], dim=1).repeat(n_samples, 1, 1)
else:
c = self.adjust_rank_f(c, fcs_lvl).repeat(n_samples, 1, 1)
u = torch.zeros_like(c)
shape = [n_samples, self.image_latent_dim, h//8, w//8]
np.random.seed(seed)
torch.manual_seed(seed + 100)
if fid_lvl!=0:
x0 = self.net.vae_encode(cx, which='image').repeat(n_samples, 1, 1, 1)
step = int(self.ddim_steps * (1-fid_lvl))
x, _ = sampler.sample(
steps=self.ddim_steps,
x_info={'type':'image', 'x0':x0, 'x0_forward_timesteps':step},
c_info={'type':'image', 'conditioning':c, 'unconditional_conditioning':u,
'unconditional_guidance_scale':scale},
shape=shape,
verbose=False,
eta=self.ddim_eta)
else:
x, _ = sampler.sample(
steps=self.ddim_steps,
x_info={'type':'image',},
c_info={'type':'image', 'conditioning':c, 'unconditional_conditioning':u,
'unconditional_guidance_scale':scale},
shape=shape,
verbose=False,
eta=self.ddim_eta)
imout = self.net.vae_decode(x, which='image')
if clr_adj == 'Simple':
cx_mean = cx.view(3, -1).mean(-1)[:, None, None]
cx_std = cx.view(3, -1).std(-1)[:, None, None]
imout_mean = [imouti.view(3, -1).mean(-1)[:, None, None] for imouti in imout]
imout_std = [imouti.view(3, -1).std(-1)[:, None, None] for imouti in imout]
imout = [(ii-mi)/si*cx_std+cx_mean for ii, mi, si in zip(imout, imout_mean, imout_std)]
imout = [torch.clamp(ii, 0, 1) for ii in imout]
imout = [tvtrans.ToPILImage()(i) for i in imout]
return imout
def inference_i2t(self, im, seed):
n_samples = self.n_sample_text
scale = self.scale_imgto
sampler = self.sampler
h, w = self.output_dim
device = self.net.device
BICUBIC = PIL.Image.Resampling.BICUBIC
im = im.resize([w, h], resample=BICUBIC)
cx = tvtrans.ToTensor()(im)[None].to(device)
c = self.net.ctx_encode(cx, which='image').repeat(n_samples, 1, 1)
u = self.net.ctx_encode(torch.zeros_like(cx), which='image').repeat(n_samples, 1, 1)
shape = [n_samples, self.text_latent_dim]
np.random.seed(seed)
torch.manual_seed(seed + 100)
x, _ = sampler.sample(
steps=self.ddim_steps,
x_info={'type':'text',},
c_info={'type':'image', 'conditioning':c, 'unconditional_conditioning':u,
'unconditional_guidance_scale':scale},
shape=shape,
verbose=False,
eta=self.ddim_eta)
tx = self.net.vae_decode(x, which='text', temperature=self.text_temperature)
tx = [remove_duplicate_word(txi) for txi in tx]
tx_combined = '\n'.join(tx)
return tx_combined
def inference_t2t(self, text, seed):
n_samples = self.n_sample_text
scale = self.scale_textto
sampler = self.sampler
u = self.net.ctx_encode([""], which='text').repeat(n_samples, 1, 1)
c = self.net.ctx_encode([text], which='text').repeat(n_samples, 1, 1)
shape = [n_samples, self.text_latent_dim]
np.random.seed(seed)
torch.manual_seed(seed + 100)
x, _ = sampler.sample(
steps=self.ddim_steps,
x_info={'type':'text',},
c_info={'type':'text', 'conditioning':c, 'unconditional_conditioning':u,
'unconditional_guidance_scale':scale},
shape=shape,
verbose=False,
eta=self.ddim_eta)
tx = self.net.vae_decode(x, which='text', temperature=self.text_temperature)
tx = [remove_duplicate_word(txi) for txi in tx]
tx_combined = '\n'.join(tx)
return tx_combined
def inference_dcg(self, imctx, fcs_lvl, textctx, textstrength, seed):
n_samples = self.n_sample_image
sampler = self.sampler
h, w = self.output_dim
device = self.net.device
c_info_list = []
if (textctx is not None) and (textctx != "") and (textstrength != 0):
ut = self.net.ctx_encode([""], which='text').repeat(n_samples, 1, 1)
ct = self.net.ctx_encode([textctx], which='text').repeat(n_samples, 1, 1)
scale = self.scale_imgto*(1-textstrength) + self.scale_textto*textstrength
c_info_list.append({
'type':'text',
'conditioning':ct,
'unconditional_conditioning':ut,
'unconditional_guidance_scale':scale,
'ratio': textstrength, })
else:
scale = self.scale_imgto
textstrength = 0
BICUBIC = PIL.Image.Resampling.BICUBIC
cx = imctx.resize([w, h], resample=BICUBIC)
cx = tvtrans.ToTensor()(cx)[None].to(device).to(self.dtype)
ci = self.net.ctx_encode(cx, which='image')
if self.disentanglement_noglobal:
ci_glb = ci[:, 0:1]
ci_loc = ci[:, 1: ]
ci_loc = self.adjust_rank_f(ci_loc, fcs_lvl)
ci = torch.cat([ci_glb, ci_loc], dim=1).repeat(n_samples, 1, 1)
else:
ci = self.adjust_rank_f(ci, fcs_lvl).repeat(n_samples, 1, 1)
c_info_list.append({
'type':'image',
'conditioning':ci,
'unconditional_conditioning':torch.zeros_like(ci),
'unconditional_guidance_scale':scale,
'ratio': (1-textstrength), })
shape = [n_samples, self.image_latent_dim, h//8, w//8]
np.random.seed(seed)
torch.manual_seed(seed + 100)
x, _ = sampler.sample_multicontext(
steps=self.ddim_steps,
x_info={'type':'image',},
c_info_list=c_info_list,
shape=shape,
verbose=False,
eta=self.ddim_eta)
imout = self.net.vae_decode(x, which='image')
imout = [tvtrans.ToPILImage()(i) for i in imout]
return imout
def inference_tcg(self, *args):
args_imag = list(args[0:10]) + [None, None, None, None, None]*2
args_rest = args[10:]
imin, imout = self.inference_mcg(*args_imag, *args_rest)
return imin, imout
def inference_mcg(self, *args):
imctx = [args[0:5], args[5:10], args[10:15], args[15:20]]
textctx, textstrength, seed = args[20:]
n_samples = self.n_sample_image
sampler = self.sampler
h, w = self.output_dim
device = self.net.device
c_info_list = []
if (textctx is not None) and (textctx != "") and (textstrength != 0):
ut = self.net.ctx_encode([""], which='text').repeat(n_samples, 1, 1)
ct = self.net.ctx_encode([textctx], which='text').repeat(n_samples, 1, 1)
scale = self.scale_imgto*(1-textstrength) + self.scale_textto*textstrength
c_info_list.append({
'type':'text',
'conditioning':ct,
'unconditional_conditioning':ut,
'unconditional_guidance_scale':scale,
'ratio': textstrength, })
else:
scale = self.scale_imgto
textstrength = 0
input_save = []
imc = []
for im, imm, strength, fcs_lvl, use_mask in imctx:
if (im is None) and (imm is None):
continue
BILINEAR = PIL.Image.Resampling.BILINEAR
BICUBIC = PIL.Image.Resampling.BICUBIC
if use_mask:
cx = imm['image'].resize([w, h], resample=BICUBIC)
cx = tvtrans.ToTensor()(cx)[None].to(self.dtype).to(device)
m = imm['mask'].resize([w, h], resample=BILINEAR)
m = tvtrans.ToTensor()(m)[None, 0:1].to(self.dtype).to(device)
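                # Invert the sketched mask so painted regions are excluded from
                # the reference context (and blacked out in the input preview).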
m = (1-m)
cx_show = cx*m
ci = self.net.ctx_encode(cx, which='image', masks=m)
else:
cx = im.resize([w, h], resample=BICUBIC)
cx = tvtrans.ToTensor()(cx)[None].to(self.dtype).to(device)
ci = self.net.ctx_encode(cx, which='image')
cx_show = cx
input_save.append(tvtrans.ToPILImage()(cx_show[0]))
if self.disentanglement_noglobal:
ci_glb = ci[:, 0:1]
ci_loc = ci[:, 1: ]
ci_loc = self.adjust_rank_f(ci_loc, fcs_lvl)
ci = torch.cat([ci_glb, ci_loc], dim=1).repeat(n_samples, 1, 1)
else:
ci = self.adjust_rank_f(ci, fcs_lvl).repeat(n_samples, 1, 1)
imc.append(ci * strength)
cis = torch.cat(imc, dim=1)
c_info_list.append({
'type':'image',
'conditioning':cis,
'unconditional_conditioning':torch.zeros_like(cis),
'unconditional_guidance_scale':scale,
'ratio': (1-textstrength), })
shape = [n_samples, self.image_latent_dim, h//8, w//8]
np.random.seed(seed)
torch.manual_seed(seed + 100)
x, _ = sampler.sample_multicontext(
steps=self.ddim_steps,
x_info={'type':'image',},
c_info_list=c_info_list,
shape=shape,
verbose=False,
eta=self.ddim_eta)
imout = self.net.vae_decode(x, which='image')
imout = [tvtrans.ToPILImage()(i) for i in imout]
return input_save, imout
# vd_inference = vd_dummy()
vd_inference = vd_inference(which='v1.0', fp16=True)
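# Hedged usage sketch (not part of the original demo flow): the Gradio callbacks
# below simply forward to these inference methods, so the model can also be run
# headlessly. The prompt and output filenames here are illustrative only.
def _demo_headless_t2i(prompt='a beautiful landscape with mountains and rivers', seed=20):
    images = vd_inference.inference_t2i(prompt, seed)
    for idx, im in enumerate(images):
        im.save('t2i_sample_{}.png'.format(idx))
    return images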
#################
# sub interface #
#################
def t2i_interface(with_example=False):
    gr.HTML('<p id=myinst>&nbsp; Description: ' + get_instruction("Text-to-Image") + '</p>')
with gr.Row():
with gr.Column():
text = gr.Textbox(lines=4, placeholder="Input prompt...", label='Text Input')
seed = gr.Number(20, label="Seed", precision=0)
button = gr.Button("Run")
with gr.Column():
img_output = gr.Gallery(label="Image Result", elem_id='customized_imbox').style(grid=n_sample_image)
button.click(
vd_inference.inference_t2i,
inputs=[text, seed],
outputs=[img_output])
if with_example:
gr.Examples(
label='Examples',
examples=get_example('Text-to-Image'),
fn=vd_inference.inference_t2i,
inputs=[text, seed],
outputs=[img_output],
cache_examples=cache_examples),
def i2i_interface(with_example=False):
    gr.HTML('<p id=myinst>&nbsp; Description: ' + get_instruction("Image-Variation") + '</p>')
with gr.Row():
with gr.Column():
img_input = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
sim_flag = gr.Checkbox(label='Show Detail Controls')
with gr.Row():
                fid_lvl = gr.Slider(label="Fidelity (Unlike -- Same)", minimum=0, maximum=1, value=0, step=0.02, visible=False)
fcs_lvl = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02, visible=False)
clr_adj = gr.Radio(label="Color Adjustment", choices=["None", "Simple"], value='Simple', visible=False)
            explain = gr.HTML('<p id=myinst>&nbsp; Fidelity: How closely the output image should resemble the reference image (0-unlike (default), 1-same).</p>'+
                '<p id=myinst>&nbsp; Focus: Which aspect of the image the output should focus on (0-semantic, 0.5-balanced (default), 1-style).</p>',
visible=False)
seed = gr.Number(20, label="Seed", precision=0)
button = gr.Button("Run")
with gr.Column():
img_output = gr.Gallery(label="Image Result", elem_id='customized_imbox').style(grid=n_sample_image)
sim_flag.change(
fn=lambda x: {
explain : gr.update(visible=x),
fid_lvl : gr.update(visible=x),
fcs_lvl : gr.update(visible=x),
clr_adj : gr.update(visible=x), },
inputs=sim_flag,
outputs=[explain, fid_lvl, fcs_lvl, clr_adj, seed],)
button.click(
vd_inference.inference_i2i,
inputs=[img_input, fid_lvl, fcs_lvl, clr_adj, seed],
outputs=[img_output])
if with_example:
gr.Examples(
label='Examples',
examples=get_example('Image-Variation'),
fn=vd_inference.inference_i2i,
inputs=[img_input, fid_lvl, fcs_lvl, clr_adj, seed],
outputs=[img_output],
cache_examples=cache_examples),
def i2t_interface(with_example=False):
    gr.HTML('<p id=myinst>&nbsp; Description: ' + get_instruction("Image-to-Text") + '</p>')
with gr.Row():
with gr.Column():
img_input = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
seed = gr.Number(20, label="Seed", precision=0)
button = gr.Button("Run")
with gr.Column():
txt_output = gr.Textbox(lines=4, label='Text Result')
button.click(
vd_inference.inference_i2t,
inputs=[img_input, seed],
outputs=[txt_output])
if with_example:
gr.Examples(
label='Examples',
examples=get_example('Image-to-Text'),
fn=vd_inference.inference_i2t,
inputs=[img_input, seed],
outputs=[txt_output],
cache_examples=cache_examples),
def t2t_interface(with_example=False):
    gr.HTML('<p id=myinst>&nbsp; Description: ' + get_instruction("Text-Variation") + '</p>')
with gr.Row():
with gr.Column():
text = gr.Textbox(lines=4, placeholder="Input prompt...", label='Text Input')
seed = gr.Number(20, label="Seed", precision=0)
button = gr.Button("Run")
with gr.Column():
txt_output = gr.Textbox(lines=4, label='Text Result')
button.click(
vd_inference.inference_t2t,
inputs=[text, seed],
outputs=[txt_output])
if with_example:
gr.Examples(
label='Examples',
examples=get_example('Text-Variation'),
fn=vd_inference.inference_t2t,
inputs=[text, seed],
outputs=[txt_output],
cache_examples=cache_examples, )
class image_mimage_swap(object):
def __init__(self, block0, block1):
self.block0 = block0
self.block1 = block1
self.which_update = 'both'
def __call__(self, x0, x1, flag):
if self.which_update == 'both':
return self.update_both(x0, x1, flag)
elif self.which_update == 'visible':
return self.update_visible(x0, x1, flag)
elif self.which_update == 'visible_oneoff':
return self.update_visible_oneoff(x0, x1, flag)
else:
assert False
def update_both(self, x0, x1, flag):
if flag:
ug0 = gr.update(visible=False)
if x0 is None:
ug1 = gr.update(value=None, visible=True)
else:
if (x1 is not None) and ('mask' in x1):
value1 = {'image':x0, 'mask':x1['mask']}
else:
value1 = {'image':x0, 'mask':None}
ug1 = gr.update(value=value1, visible=True)
else:
if (x1 is not None) and ('image' in x1):
value0 = x1['image']
else:
value0 = None
ug0 = gr.update(value=value0, visible=True)
ug1 = gr.update(visible=False)
return {
self.block0 : ug0,
self.block1 : ug1,}
def update_visible(self, x0, x1, flag):
return {
self.block0 : gr.update(visible=not flag),
self.block1 : gr.update(visible=flag), }
def update_visible_oneoff(self, x0, x1, flag):
self.which_update = 'both'
return {
self.block0 : gr.update(visible=not flag),
self.block1 : gr.update(visible=flag), }
class example_visible_only_hack(object):
def __init__(self, checkbox_list, functor_list):
self.checkbox_list = checkbox_list
self.functor_list = functor_list
def __call__(self, *args):
for bi, fi, vi in zip(self.checkbox_list, self.functor_list, args):
if bi.value != vi:
fi.which_update = 'visible_oneoff'
def dcg_interface(with_example=False):
    gr.HTML('<p id=myinst>&nbsp; Description: ' + get_instruction("Dual-Context") + '</p>')
with gr.Row():
input_session = []
with gr.Column():
img = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
fcs = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02)
            gr.HTML('<p id=myinst>&nbsp; Focus: Which aspect of the image to focus on (0-semantic, 0.5-balanced (default), 1-style).</p>')
text = gr.Textbox(lines=2, placeholder="Input prompt...", label='Text Input')
tstrength = gr.Slider(label="Text Domination (NoEffect -- TextOnly)", minimum=0, maximum=1, value=0, step=0.02)
seed = gr.Number(20, label="Seed", precision=0)
button = gr.Button("Run")
with gr.Column():
output_gallary = gr.Gallery(label="Image Result", elem_id='customized_imbox').style(grid=n_sample_image)
input_list = []
for i in input_session:
input_list += i
button.click(
vd_inference.inference_dcg,
inputs=[img, fcs, text, tstrength, seed],
outputs=[output_gallary])
if with_example:
gr.Examples(
label='Examples',
examples=get_example('Dual-Context'),
fn=vd_inference.inference_dcg,
inputs=[img, fcs, text, tstrength, seed],
outputs=[output_gallary],
cache_examples=cache_examples)
def tcg_interface(with_example=False):
    gr.HTML('<p id=myinst>&nbsp; Description: ' + get_instruction("Triple-Context") + '</p>')
with gr.Row():
input_session = []
with gr.Column(min_width=940):
with gr.Row():
with gr.Column():
img0 = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
img0.as_example = types.MethodType(customized_as_example, img0)
imgm0 = gr.Image(label='Image Input with Mask', type='pil', elem_id='customized_imbox', tool='sketch', source="upload", visible=False)
imgm0.postprocess = types.MethodType(customized_postprocess, imgm0)
imgm0.as_example = types.MethodType(customized_as_example, imgm0)
istrength0 = gr.Slider(label="Weight", minimum=0, maximum=1, value=1, step=0.02)
fcs0 = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02)
msk0 = gr.Checkbox(label='Use mask?')
swapf0 = image_mimage_swap(img0, imgm0)
msk0.change(
fn=swapf0,
inputs=[img0, imgm0, msk0],
outputs=[img0, imgm0],)
input_session.append([img0, imgm0, istrength0, fcs0, msk0])
with gr.Column():
img1 = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
img1.as_example = types.MethodType(customized_as_example, img1)
imgm1 = gr.Image(label='Image Input with Mask', type='pil', elem_id='customized_imbox', tool='sketch', source="upload", visible=False)
imgm1.postprocess = types.MethodType(customized_postprocess, imgm1)
imgm1.as_example = types.MethodType(customized_as_example, imgm1)
istrength1 = gr.Slider(label="Weight", minimum=0, maximum=1, value=1, step=0.02)
fcs1 = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02)
msk1 = gr.Checkbox(label='Use mask?')
swapf1 = image_mimage_swap(img1, imgm1)
msk1.change(
fn=swapf1,
inputs=[img1, imgm1, msk1],
outputs=[img1, imgm1],)
input_session.append([img1, imgm1, istrength1, fcs1, msk1])
            gr.HTML('<p id=myinst>&nbsp; Weight: The strength of the reference image. This weight is subject to <u>Text Domination</u>.</p>'+
                '<p id=myinst>&nbsp; Focus: Which aspect of the image to focus on (0-semantic, 0.5-balanced (default), 1-style).</p>'+
                '<p id=myinst>&nbsp; Mask: Remove regions of the reference image so they do not influence the output.</p>',)
text = gr.Textbox(lines=2, placeholder="Input prompt...", label='Text Input')
tstrength = gr.Slider(label="Text Domination (NoEffect -- TextOnly)", minimum=0, maximum=1, value=0, step=0.02)
seed = gr.Number(20, label="Seed", precision=0)
button = gr.Button("Run")
with gr.Column(min_width=470):
input_gallary = gr.Gallery(label="Input Display", elem_id="customized_imbox").style(grid=2)
output_gallary = gr.Gallery(label="Image Result", elem_id="customized_imbox").style(grid=n_sample_image)
input_list = []
for i in input_session:
input_list += i
input_list += [text, tstrength, seed]
button.click(
vd_inference.inference_tcg,
inputs=input_list,
outputs=[input_gallary, output_gallary])
if with_example:
create_myexamples(
label='Examples',
examples=get_example('Triple-Context'),
fn=vd_inference.inference_tcg,
inputs=input_list,
outputs=[input_gallary, output_gallary, ],
cache_examples=cache_examples, )
    gr.HTML('<br><p id=myinst>&nbsp; How to add a mask: please see the following instructions.</p><br>'+
'<div id="maskinst">'+
'<img src="file/assets/demo/misc/mask_inst1.gif">'+
'<img src="file/assets/demo/misc/mask_inst2.gif">'+
'<img src="file/assets/demo/misc/mask_inst3.gif">'+
'</div>')
def mcg_interface(with_example=False):
num_img_input = 4
    gr.HTML('<p id=myinst>&nbsp; Description: ' + get_instruction("Multi-Context") + '</p>')
with gr.Row():
input_session = []
with gr.Column():
for idx in range(num_img_input):
with gr.Tab('Image{}'.format(idx+1)):
img = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
img.as_example = types.MethodType(customized_as_example, img)
imgm = gr.Image(label='Image Input with Mask', type='pil', elem_id='customized_imbox', tool='sketch', source="upload", visible=False)
imgm.postprocess = types.MethodType(customized_postprocess, imgm)
imgm.as_example = types.MethodType(customized_as_example, imgm)
with gr.Row():
istrength = gr.Slider(label="Weight", minimum=0, maximum=1, value=1, step=0.02)
fcs = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02)
msk = gr.Checkbox(label='Use mask?')
                    gr.HTML('<p id=myinst>&nbsp; Weight: The strength of the reference image. This weight is subject to <u>Text Domination</u>.</p>'+
                        '<p id=myinst>&nbsp; Focus: Which aspect of the image to focus on (0-semantic, 0.5-balanced (default), 1-style).</p>'+
                        '<p id=myinst>&nbsp; Mask: Remove regions of the reference image so they do not influence the output.</p>',)
msk.change(
fn=image_mimage_swap(img, imgm),
inputs=[img, imgm, msk],
outputs=[img, imgm],)
input_session.append([img, imgm, istrength, fcs, msk])
text = gr.Textbox(lines=2, placeholder="Input prompt...", label='Text Input')
tstrength = gr.Slider(label="Text Domination (NoEffect -- TextOnly)", minimum=0, maximum=1, value=0, step=0.02)
seed = gr.Number(20, label="Seed", precision=0)
button = gr.Button("Run")
with gr.Column():
input_gallary = gr.Gallery(label="Input Display", elem_id='customized_imbox').style(grid=4)
output_gallary = gr.Gallery(label="Image Result", elem_id='customized_imbox').style(grid=n_sample_image)
input_list = []
for i in input_session:
input_list += i
input_list += [text, tstrength, seed]
button.click(
vd_inference.inference_mcg,
inputs=input_list,
outputs=[input_gallary, output_gallary], )
if with_example:
create_myexamples(
label='Examples',
examples=get_example('Multi-Context'),
fn=vd_inference.inference_mcg,
inputs=input_list,
outputs=[input_gallary, output_gallary],
cache_examples=cache_examples, )
    gr.HTML('<br><p id=myinst>&nbsp; How to add a mask: please see the following instructions.</p><br>'+
'<div id="maskinst">'+
'<img src="file/assets/demo/misc/mask_inst1.gif">'+
'<img src="file/assets/demo/misc/mask_inst2.gif">'+
'<img src="file/assets/demo/misc/mask_inst3.gif">'+
'</div>')
###########
# Example #
###########
def get_example(mode):
if mode == 'Text-to-Image':
case = [
['a dream of a village in china, by Caspar David Friedrich, matte painting trending on artstation HQ', 23],
['a beautiful landscape with mountains and rivers', 20],
]
elif mode == "Image-Variation":
case = [
['assets/demo/reg_example/ghibli.jpg', 0, 0.5, 'None', 20],
['assets/demo/reg_example/ghibli.jpg', 0.5, 0.5, 'None', 20],
['assets/demo/reg_example/matisse.jpg', 0, 0, 'None', 20],
['assets/demo/reg_example/matisse.jpg', 0, 1, 'Simple', 20],
['assets/demo/reg_example/vermeer.jpg', 0.2, 0.3, 'None', 30],
]
elif mode == "Image-to-Text":
case = [
['assets/demo/reg_example/house_by_lake.jpg', 20],
]
elif mode == "Text-Variation":
case = [
['heavy arms gundam penguin mech', 20],
]
elif mode == "Dual-Context":
case = [
['assets/demo/reg_example/benz.jpg', 0.5, 'cyberpunk 2077', 0.7, 22],
['assets/demo/reg_example/ghibli.jpg', 1, 'Red maple on a hill in golden Autumn.', 0.5, 21],
]
elif mode == "Triple-Context":
case = [
[
'assets/demo/reg_example/night_light.jpg', None, 1 , 0.5, False,
'assets/demo/reg_example/paris.jpg' , None, 0.94, 0.5, False,
"snow on the street", 0.4, 28],
[
'assets/demo/tcg_example/e1i0.jpg', None, 1 , 0.5, False,
'assets/demo/tcg_example/e1i1.jpg', None, 0.94, 0.5, False,
"a painting of an elegant woman in front of the moon", 0.2, 217],
[
'assets/demo/tcg_example/e2i0.jpg', None, 1, 0.5, False,
'assets/demo/reg_example/paris.jpg', None, 1, 0.5, False,
"", 0, 29],
[
'assets/demo/tcg_example/e0i0.jpg', None, 1 , 0.5, False,
'assets/demo/tcg_example/e0i1.jpg', None, 0.9, 0.5, False,
"rose blooms on the tree", 0.2, 20],
[
'assets/demo/reg_example/ghibli.jpg', None, 1 , 1 , False,
'assets/demo/reg_example/space.jpg' , None, 0.88, 0.5, False,
"", 0, 20],
[
'assets/demo/reg_example/train.jpg' , None, 0.8, 0.5, False,
'assets/demo/reg_example/matisse.jpg', None, 1 , 1 , False,
"", 0, 20],
]
elif mode == "Multi-Context":
case = [
[
'assets/demo/mcg_example/e0i0.jpg', None, 1, 0.5, False,
'assets/demo/mcg_example/e0i1.jpg', None, 1, 0.5, False,
'assets/demo/mcg_example/e0i2.jpg', None, 0.86, 0.5, False,
None, None, 1, 0.5, False,
"", 0, 20],
]
else:
raise ValueError
return case
#############
# Interface #
#############
css = """
#customized_imbox {
min-height: 450px;
}
#customized_imbox>div[data-testid="image"] {
min-height: 450px;
}
#customized_imbox>div[data-testid="image"]>div {
min-height: 450px;
}
#customized_imbox>div[data-testid="image"]>iframe {
min-height: 450px;
}
#customized_imbox>div.unpadded_box {
min-height: 450px;
}
#myinst {
font-size: 0.8rem;
margin: 0rem;
color: #6B7280;
}
#maskinst {
text-align: justify;
min-width: 1200px;
}
#maskinst>img {
min-width:399px;
max-width:450px;
vertical-align: top;
display: inline-block;
}
#maskinst:after {
content: "";
width: 100%;
display: inline-block;
}
"""
if True:
with gr.Blocks(css=css) as demo:
gr.HTML(
"""
<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
<h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
Versatile Diffusion
</h1>
<h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
We built <b>Versatile Diffusion (VD), the first unified multi-flow multimodal diffusion framework</b>, as a step towards <b>Universal Generative AI</b>.
VD can natively support image-to-text, image-variation, text-to-image, and text-variation,
and can be further extended to other applications such as
semantic-style disentanglement, image-text dual-guided generation, latent image-to-text-to-image editing, and more.
Future versions will support more modalities such as speech, music, video and 3D.
</h2>
<h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
Xingqian Xu, Atlas Wang, Eric Zhang, Kai Wang,
and <a href="https://www.humphreyshi.com/home">Humphrey Shi</a>
[<a href="https://arxiv.org/abs/2211.08332" style="color:blue;">arXiv</a>]
[<a href="https://github.com/SHI-Labs/Versatile-Diffusion" style="color:blue;">GitHub</a>]
</h3>
</div>
""")
with gr.Tab('Text-to-Image'):
t2i_interface(with_example=True)
with gr.Tab('Image-Variation'):
i2i_interface(with_example=True)
with gr.Tab('Image-to-Text'):
i2t_interface(with_example=True)
with gr.Tab('Text-Variation'):
t2t_interface(with_example=True)
with gr.Tab('Dual-Context Image-Generation'):
dcg_interface(with_example=True)
with gr.Tab('Triple-Context Image-Blender'):
tcg_interface(with_example=True)
with gr.Tab('Multi-Context Image-Blender'):
mcg_interface(with_example=True)
gr.HTML(
"""
<div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
<b>Version</b>: {}
</h3>
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
<b>Caution</b>:
        We would like to raise users' awareness of the potential issues and concerns with this demo.
        Like previous large foundation models, Versatile Diffusion can be problematic in some cases, partly due to imperfect training data and pretrained networks (VAEs / context encoders) with limited scope.
        In its future research phase, VD may do better on tasks such as text-to-image, image-to-text, etc., with the help of more powerful VAEs, more sophisticated network designs, and cleaner data.
        So far, we keep all features available for research testing, both to show the great potential of the VD framework and to collect important feedback for improving the model in the future.
        We welcome researchers and users to report issues via the HuggingFace community discussion feature or by emailing the authors.
</h3>
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
<b>Biases and content acknowledgement</b>:
Beware that VD may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography, and violence.
        VD was trained on the LAION-2B dataset, which scraped non-curated online images and text; although we removed illegal content, unintended exceptions may remain.
VD in this demo is meant only for research purposes.
</h3>
</div>
""".format(' '+vd_inference.which))
# demo.launch(share=True)
demo.launch(debug=True)