from math import inf
import os
import base64
import json
import gradio as gr
import numpy as np
from gradio import processing_utils
import requests
from packaging import version
from PIL import Image, ImageDraw
import functools
from langchain.llms.openai import OpenAI
from caption_anything.model import CaptionAnything
from caption_anything.utils.image_editing_utils import create_bubble_frame
from caption_anything.utils.utils import mask_painter, seg_model_map, prepare_segmenter, image_resize
from caption_anything.utils.parser import parse_augment
from caption_anything.captioner import build_captioner
from caption_anything.text_refiner import build_text_refiner
from caption_anything.segmenter import build_segmenter
from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
from segment_anything import sam_model_registry
import easyocr
import tts

###############################################################################
#############             this part is for 3D generate           #############
###############################################################################

# import spaces
import os
import imageio
import numpy as np
import torch
import rembg
from PIL import Image
from torchvision.transforms import v2
from pytorch_lightning import seed_everything
from omegaconf import OmegaConf
from einops import rearrange, repeat
from tqdm import tqdm
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler

from src.utils.train_util import instantiate_from_config
from src.utils.camera_util import (
    FOV_to_intrinsics,
    get_zero123plus_input_cameras,
    get_circular_camera_poses,
)
from src.utils.mesh_util import save_obj, save_glb
from src.utils.infer_util import remove_background, resize_foreground, images_to_video

import tempfile
from functools import partial
from huggingface_hub import hf_hub_download


def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
    """
    Get the rendering camera parameters.
    """
    c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation)
    if is_flexicubes:
        cameras = torch.linalg.inv(c2ws)
        cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1, 1)
    else:
        extrinsics = c2ws.flatten(-2)
        intrinsics = FOV_to_intrinsics(50.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2)
        cameras = torch.cat([extrinsics, intrinsics], dim=-1)
        cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1)
    return cameras


def images_to_video(images, output_path, fps=30):
    # images: (N, C, H, W)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    frames = []
    for i in range(images.shape[0]):
        # Clip to [0, 255] before casting so out-of-range values do not wrap around.
        frame = (images[i].permute(1, 2, 0).cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
        assert frame.shape[0] == images.shape[2] and frame.shape[1] == images.shape[3], \
            f"Frame shape mismatch: {frame.shape} vs {images.shape}"
        assert frame.min() >= 0 and frame.max() <= 255, \
            f"Frame value out of range: {frame.min()} ~ {frame.max()}"
        frames.append(frame)
    imageio.mimwrite(output_path, np.stack(frames), fps=fps, codec='h264')


###############################################################################
# Configuration.
###############################################################################

import shutil


def find_cuda():
    # Check whether the CUDA_HOME or CUDA_PATH environment variable is set.
    cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
    if cuda_home and os.path.exists(cuda_home):
        return cuda_home

    # Otherwise, search for the nvcc executable on the system PATH.
    nvcc_path = shutil.which('nvcc')
    if nvcc_path:
        # Strip the trailing 'bin/nvcc' to get the CUDA installation root.
        cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
        return cuda_path

    return None


cuda_path = find_cuda()
if cuda_path:
    print(f"CUDA installation found at: {cuda_path}")
else:
    print("CUDA installation not found")

config_path = 'configs/instant-nerf-base.yaml'
config = OmegaConf.load(config_path)
config_name = os.path.basename(config_path).replace('.yaml', '')
model_config = config.model_config
infer_config = config.infer_config

IS_FLEXICUBES = config_name.startswith('instant-mesh')

device = torch.device('cuda')

# Load the diffusion model.
print('Loading diffusion model ...')
pipeline = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2",
    custom_pipeline="zero123plus",
    torch_dtype=torch.float16,
)
pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
    pipeline.scheduler.config, timestep_spacing='trailing'
)

# Load the custom white-background UNet.
unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin",
                                 repo_type="model")
state_dict = torch.load(unet_ckpt_path, map_location='cpu')
pipeline.unet.load_state_dict(state_dict, strict=True)

pipeline = pipeline.to(device)

# Load the reconstruction model.
print('Loading reconstruction model ...')
model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_nerf_base.ckpt",
                                  repo_type="model")
model0 = instantiate_from_config(model_config)
state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.') and 'source_camera' not in k}
model0.load_state_dict(state_dict, strict=True)

model0 = model0.to(device)

print('Loading Finished!')


def check_input_image(input_image):
    if input_image is None:
        raise gr.Error("No image uploaded!")
    return Image.open(input_image)


def preprocess(input_image, do_remove_background):
    rembg_session = rembg.new_session() if do_remove_background else None

    if do_remove_background:
        input_image = remove_background(input_image, rembg_session)
        input_image = resize_foreground(input_image, 0.85)

    return input_image


# @spaces.GPU
def generate_mvs(input_image, sample_steps, sample_seed):
    seed_everything(sample_seed)

    # Sampling.
    z123_image = pipeline(
        input_image,
        num_inference_steps=sample_steps
    ).images[0]

    show_image = np.asarray(z123_image, dtype=np.uint8)
    show_image = torch.from_numpy(show_image)     # (960, 640, 3)
    show_image = rearrange(show_image, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
    show_image = rearrange(show_image, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
    show_image = Image.fromarray(show_image.numpy())

    return z123_image, show_image


# @spaces.GPU
def make3d(images):
    global model0
    if IS_FLEXICUBES:
        model0.init_flexicubes_geometry(device)
    model0 = model0.eval()

    images = np.asarray(images, dtype=np.float32) / 255.0
    images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float()     # (3, 960, 640)
    images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2)        # (6, 3, 320, 320)

    input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device)
    render_cameras = get_render_cameras(batch_size=1, radius=2.5, is_flexicubes=IS_FLEXICUBES).to(device)

    images = images.unsqueeze(0).to(device)
    images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)

    mesh_fpath = tempfile.NamedTemporaryFile(suffix=".obj", delete=False).name
    print(mesh_fpath)
    mesh_basename = os.path.basename(mesh_fpath).split('.')[0]
    mesh_dirname = os.path.dirname(mesh_fpath)
    video_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.mp4")
    mesh_glb_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.glb")

    with torch.no_grad():
        # Get the triplane.
        planes = model0.forward_planes(images, input_cameras)

        # # Get the video.
        # chunk_size = 20 if IS_FLEXICUBES else 1
        # render_size = 384
        #
        # frames = []
        # for i in tqdm(range(0, render_cameras.shape[1], chunk_size)):
        #     if IS_FLEXICUBES:
        #         frame = model0.forward_geometry(
        #             planes,
        #             render_cameras[:, i:i + chunk_size],
        #             render_size=render_size,
        #         )['img']
        #     else:
        #         frame = model0.synthesizer(
        #             planes,
        #             cameras=render_cameras[:, i:i + chunk_size],
        #             render_size=render_size,
        #         )['images_rgb']
        #     frames.append(frame)
        # frames = torch.cat(frames, dim=1)
        #
        # images_to_video(
        #     frames[0],
        #     video_fpath,
        #     fps=30,
        # )
        # print(f"Video saved to {video_fpath}")

        # Get the mesh.
        mesh_out = model0.extract_mesh(
            planes,
            use_texture_map=False,
            **infer_config,
        )

        vertices, faces, vertex_colors = mesh_out
        vertices = vertices[:, [1, 2, 0]]

        save_glb(vertices, faces, vertex_colors, mesh_glb_fpath)
        save_obj(vertices, faces, vertex_colors, mesh_fpath)

        print(f"Mesh saved to {mesh_fpath}")

    return mesh_fpath, mesh_glb_fpath


###############################################################################
#############            above part is for 3D generate           #############
###############################################################################

gpt_state = 0
pre_click_index = (inf, inf)

article = """
By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
Gradio demo for EyeSee Anything in Art: dense image captioning with a variety of language styles.
To use it, simply upload your image or click one of the examples to load them.
"""

examples = [
    ["test_images/img36.webp"],
    ["test_images/MUS.png"],
    ["test_images/图片2.png"],
    ["test_images/img5.jpg"],
    ["test_images/img14.jpg"],
    ["test_images/qingming3.jpeg"],
]


def create_ui():
    with gr.Blocks(css=get_style()) as iface:
        state = gr.State([])
        out_state = gr.State(None)
        click_state = gr.State([[], [], []])
        origin_image = gr.State(None)
        image_embedding = gr.State(None)
        text_refiner = gr.State(None)
        visual_chatgpt = gr.State(None)
        original_size = gr.State(None)
        input_size = gr.State(None)
        generated_caption = gr.State("")
        paragraph = gr.State("")
        aux_state = gr.State([])
        click_index_state = gr.State((0, 0))
        input_mask_state = gr.State(np.zeros((1, 1)))
        input_points_state = gr.State([])
        input_labels_state = gr.State([])
        new_crop_save_path = gr.State(None)
        image_input_nobackground = gr.State(None)

        gr.Markdown(title)
        gr.Markdown(description)

        with gr.Row():
            with gr.Column(scale=1.0):
                with gr.Column(visible=False) as modules_not_need_gpt:
                    with gr.Tab("Base (GPT Power)", visible=False) as base_tab:
                        image_intro = gr.HTML()
                        image_input_base = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                        example_image = gr.Image(type="pil", interactive=False, visible=False)
                        with gr.Row():
                            name_label_base = gr.Button(value="Name: ")
                            artist_label_base = gr.Button(value="Artist: ")
                            year_label_base = gr.Button(value="Year: ")
                            material_label_base = gr.Button(value="Material: ")

                    with gr.Tab("Click") as click_tab:
                        image_intro_click = gr.HTML()
                        image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                        example_image = gr.Image(type="pil", interactive=False, visible=False)
                        with gr.Row():
                            name_label = gr.Button(value="Name: ")
                            artist_label = gr.Button(value="Artist: ")
                            year_label = gr.Button(value="Year: ")
                            material_label = gr.Button(value="Material: ")
                        with gr.Row(scale=1.0):
                            focus_type = gr.Radio(
                                choices=["CFV-D", "CFV-DA", "PFV-DA", "PFV-DAI"],
                                value="CFV-D",
                                label="Focus Type",
                                interactive=True)
                        with gr.Row(scale=1.0):
                            with gr.Row(scale=0.4):
                                point_prompt = gr.Radio(
                                    choices=["Positive", "Negative"],
                                    value="Positive",
                                    label="Point Prompt",
                                    interactive=True)
                                click_mode = gr.Radio(
                                    choices=["Continuous", "Single"],
                                    value="Continuous",
                                    label="Clicking Mode",
                                    interactive=True)
                            with gr.Row(scale=0.4):
                                clear_button_click = gr.Button(value="Clear Clicks", interactive=True)
                                clear_button_image = gr.Button(value="Clear Image", interactive=True)
                                submit_button_click = gr.Button(value="Submit", interactive=True)

                    with gr.Tab("Trajectory (beta)"):
                        sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
                                                       elem_id="image_sketcher")
                        with gr.Row():
                            submit_button_sketcher = gr.Button(value="Submit", interactive=True)

                with gr.Column(visible=False) as modules_need_gpt1:
                    with gr.Row(scale=1.0):
                        language = gr.Dropdown(
                            ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
                            value="English", label="Language", interactive=True)
                        sentiment = gr.Radio(
                            choices=["Positive", "Natural", "Negative"],
                            value="Natural",
                            label="Sentiment",
                            interactive=True,
                        )
                    with gr.Row(scale=1.0):
                        factuality = gr.Radio(
                            choices=["Factual", "Imagination"],
                            value="Factual",
                            label="Factuality",
                            interactive=True,
                        )
                        length = gr.Slider(
                            minimum=10,
                            maximum=80,
                            value=10,
                            step=1,
                            interactive=True,
                            label="Generated Caption Length",
                        )
                        # Whether to integrate Wiki content into the caption.
                        enable_wiki = gr.Radio(
                            choices=["Yes", "No"],
                            value="No",
                            label="Enable Wiki",
                            interactive=True)
label="Enable Wiki", interactive=True) # with gr.Column(visible=True) as modules_not_need_gpt3: gr.Examples( examples=examples, inputs=[example_image], ) with gr.Column(scale=0.5): with gr.Column(visible=True) as module_key_input: openai_api_key = gr.Textbox( placeholder="Input openAI API key", show_label=False, label="OpenAI API Key", lines=1, type="password") with gr.Row(scale=0.5): enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary') disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True, variant='primary') with gr.Column(visible=False) as module_notification_box: notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False) with gr.Column(): with gr.Column(visible=False) as modules_need_gpt2: paragraph_output = gr.Textbox(lines=7, label="Describe Everything", max_lines=7) with gr.Column(visible=False) as modules_need_gpt0: cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True) with gr.Column(visible=False) as modules_not_need_gpt2: chatbot = gr.Chatbot(label="Chatbox", ).style(height=550, scale=0.5) with gr.Column(visible=False) as modules_need_gpt3: chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style( container=False) with gr.Row(): clear_button_text = gr.Button(value="Clear Text", interactive=True) submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary") with gr.Row(): export_button = gr.Button(value="Export Chat Log", interactive=True, variant="primary") with gr.Row(): chat_log_file = gr.File(label="Download Chat Log") with gr.Column(scale=0.5): # TTS interface hidden initially with gr.Column(visible=False) as tts_interface: input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality") input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en") input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav") input_mic = gr.Audio(source="microphone", type="filepath", label="Use Microphone for Reference") use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False) agree = gr.Checkbox(label="Agree", value=True) output_waveform = gr.Video(label="Waveform Visual") output_audio = gr.HTML(label="Synthesised Audio") with gr.Row(): submit_tts = gr.Button(value="Submit", interactive=True) clear_tts = gr.Button(value="Clear", interactive=True) ############################################################################### # this part is for 3d generate. 
        ###############################################################################

        with gr.Row(variant="panel"):
            with gr.Column():
                with gr.Row():
                    input_image = gr.Image(
                        label="Input Image",
                        image_mode="RGBA",
                        sources="upload",
                        # width=256,
                        # height=256,
                        type="pil",
                        elem_id="content_image",
                    )
                    processed_image = gr.Image(
                        label="Processed Image",
                        image_mode="RGBA",
                        # width=256,
                        # height=256,
                        type="pil",
                        interactive=False
                    )
                with gr.Row():
                    with gr.Group():
                        do_remove_background = gr.Checkbox(
                            label="Remove Background", value=True
                        )
                        sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
                        sample_steps = gr.Slider(
                            label="Sample Steps",
                            minimum=30,
                            maximum=75,
                            value=75,
                            step=5
                        )
                with gr.Row():
                    submit = gr.Button("Generate", elem_id="generate", variant="primary")
                with gr.Row(variant="panel"):
                    gr.Examples(
                        examples=[
                            os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples"))
                        ],
                        inputs=[input_image],
                        label="Examples",
                        cache_examples=False,
                        examples_per_page=16
                    )

            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        mv_show_images = gr.Image(
                            label="Generated Multi-views",
                            type="pil",
                            width=379,
                            interactive=False
                        )
                    # with gr.Column():
                    #     output_video = gr.Video(
                    #         label="video", format="mp4",
                    #         width=379,
                    #         autoplay=True,
                    #         interactive=False
                    #     )
                with gr.Row():
                    with gr.Tab("OBJ"):
                        output_model_obj = gr.Model3D(
                            label="Output Model (OBJ Format)",
                            interactive=False,
                        )
                        gr.Markdown("Note: the downloaded .obj model will be flipped. "
                                    "Export the .glb instead, or flip it manually before use.")
                    with gr.Tab("GLB"):
                        output_model_glb = gr.Model3D(
                            label="Output Model (GLB Format)",
                            interactive=False,
                        )
                        gr.Markdown("Note: the model shown here appears darker than it should. "
                                    "Download it to get the correct result.")

        mv_images = gr.State()

        submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success(
            fn=generate_mvs,
            inputs=[processed_image, sample_steps, sample_seed],
            outputs=[mv_images, mv_show_images]
        ).success(
            fn=make3d,
            inputs=[mv_images],
            outputs=[output_model_obj, output_model_glb]
        )

        ###############################################################################
        # Above part is for 3D generation.
        ###############################################################################

        def clear_tts_fields():
            return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False),
                    gr.update(value=True), None, None]

        submit_tts.click(
            tts.predict,
            inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree],
            outputs=[output_waveform, output_audio],
            queue=True
        )

        clear_tts.click(
            clear_tts_fields,
            inputs=None,
            outputs=[input_text, input_language, input_audio, input_mic, use_mic, agree,
                     output_waveform, output_audio],
            queue=False
        )

        openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                              outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2, modules_need_gpt3,
                                       modules_not_need_gpt, modules_not_need_gpt2, tts_interface, module_key_input,
                                       module_notification_box, text_refiner, visual_chatgpt, notification_box])
        enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                    outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2,
                                             modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2,
                                             tts_interface, module_key_input, module_notification_box,
                                             text_refiner, visual_chatgpt, notification_box])
        disable_chatGPT_button.click(init_wo_openai_api_key,
                                     outputs=[modules_need_gpt0, modules_need_gpt1, modules_need_gpt2,
                                              modules_need_gpt3, modules_not_need_gpt, modules_not_need_gpt2,
                                              tts_interface, module_key_input, module_notification_box,
                                              text_refiner, visual_chatgpt, notification_box])

        enable_chatGPT_button.click(
            lambda: (None, [], [], [[], [], []], "", "", ""),
            [],
            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
            queue=False,
            show_progress=False
        )
        openai_api_key.submit(
            lambda: (None, [], [], [[], [], []], "", "", ""),
            [],
            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
            queue=False,
            show_progress=False
        )

        cap_everything_button.click(cap_everything,
                                    [origin_image, visual_chatgpt, text_refiner, input_language, input_audio,
                                     input_mic, use_mic, agree],
                                    [paragraph_output, output_waveform, output_audio])

        clear_button_click.click(
            lambda x: ([[], [], []], x),
            [origin_image],
            [click_state, image_input],
            queue=False,
            show_progress=False
        )
        clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
        clear_button_image.click(
            lambda: (None, [], [], [[], [], []], "", "", ""),
            [],
            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
            queue=False,
            show_progress=False
        )
        clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt])
        clear_button_text.click(
            lambda: ([], [], [[], [], []]),
            [],
            [chatbot, state, click_state],
            queue=False,
            show_progress=False
        )
        clear_button_text.click(clear_chat_memory, inputs=[visual_chatgpt])

        image_input.clear(
            lambda: (None, [], [], [[], [], []], "", "", ""),
            [],
            [image_input, chatbot, state, click_state, paragraph_output, origin_image],
            queue=False,
            show_progress=False
        )
        image_input.clear(clear_chat_memory, inputs=[visual_chatgpt])

        image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt, openai_api_key],
                                [chatbot, state, origin_image, click_state, image_input, image_input_base,
                                 sketcher_input, image_embedding, original_size, input_size, name_label,
                                 artist_label, year_label, material_label, name_label_base, artist_label_base,
                                 year_label_base, material_label_base, paragraph])

        image_input.upload(upload_callback, [image_input, state, visual_chatgpt, openai_api_key],
                           [chatbot, state, origin_image, click_state, image_input, image_input_base,
                            sketcher_input, image_embedding, original_size, input_size, name_label,
                            artist_label, year_label, material_label, name_label_base, artist_label_base,
                            year_label_base, material_label_base, paragraph])
        sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt, openai_api_key],
                              [chatbot, state, origin_image, click_state, image_input, image_input_base,
                               sketcher_input, image_embedding, original_size, input_size, name_label,
                               artist_label, year_label, material_label, name_label_base, artist_label_base,
                               year_label_base, material_label_base, paragraph])

        chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
                          [chatbot, state, aux_state])
        chat_input.submit(lambda: "", None, chat_input)
        submit_button_text.click(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state],
                                 [chatbot, state, aux_state])
        submit_button_text.click(lambda: "", None, chat_input)

        example_image.change(upload_callback, [example_image, state, visual_chatgpt, openai_api_key],
                             [chatbot, state, origin_image, click_state, image_input, image_input_base,
                              sketcher_input, image_embedding, original_size, input_size, name_label,
                              artist_label, year_label, material_label, name_label_base, artist_label_base,
                              year_label_base, material_label_base, paragraph])
        example_image.change(clear_chat_memory, inputs=[visual_chatgpt])

        def on_click_tab_selected():
            if gpt_state == 1:
                print(gpt_state)
                print("using gpt")
                return [gr.update(visible=True)] * 2 + [gr.update(visible=False)] * 2
            else:
                print("no gpt")
                print("gpt_state", gpt_state)
                return [gr.update(visible=False)] + [gr.update(visible=True)] + [gr.update(visible=False)] * 2

        def on_base_selected():
            if gpt_state == 1:
                print(gpt_state)
                print("using gpt")
                return [gr.update(visible=True)] * 2 + [gr.update(visible=False)] * 2
            else:
                print("no gpt")
                return [gr.update(visible=False)] * 4

        click_tab.select(on_click_tab_selected,
                         outputs=[modules_need_gpt1, modules_not_need_gpt2, modules_need_gpt0, modules_need_gpt2])
        base_tab.select(on_base_selected,
                        outputs=[modules_need_gpt0, modules_need_gpt2, modules_not_need_gpt2, modules_need_gpt1])

        image_input.select(
            inference_click,
            inputs=[
                origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
                image_embedding, state, click_state, original_size, input_size, text_refiner, visual_chatgpt,
                out_state, click_index_state, input_mask_state, input_points_state, input_labels_state,
            ],
            outputs=[chatbot, state, click_state, image_input, input_image, generated_caption, click_index_state,
                     input_mask_state, input_points_state, input_labels_state, out_state, new_crop_save_path,
                     image_input_nobackground],
            show_progress=False, queue=True
        )

        submit_button_click.click(
            submit_caption,
            inputs=[
                image_input, state, generated_caption, text_refiner, visual_chatgpt, enable_wiki, length, sentiment,
                factuality, language, out_state, click_index_state, input_mask_state, input_points_state,
                input_labels_state, input_text, input_language, input_audio, input_mic, use_mic, agree, paragraph,
                focus_type, openai_api_key, new_crop_save_path
            ],
            outputs=[
                chatbot, state, image_input, click_index_state, input_mask_state, input_points_state,
                input_labels_state, out_state, output_waveform, output_audio
            ],
            show_progress=True, queue=True
        )

        focus_type.change(
            lambda x: ([[], [], []], x),
            [image_input_nobackground],
            [click_state, image_input],
            queue=False,
            show_progress=False
        )

        submit_button_sketcher.click(
            inference_traject,
            inputs=[
                sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
                original_size, input_size, text_refiner
            ],
            outputs=[chatbot, state, sketcher_input],
            show_progress=False, queue=True
        )

        export_button.click(
            export_chat_log,
            inputs=[state],
            outputs=[chat_log_file],
            queue=True
        )

    return iface


if __name__ == '__main__':
    iface = create_ui()
    iface.queue(concurrency_count=5, api_open=False, max_size=10)
    iface.launch(server_name="0.0.0.0", enable_queue=True)