update for video depth
- app.py +100 -39
- files/videos/00.mp4 +3 -0
- files/videos/01.mp4 +3 -0
- infer.py +69 -79
- pipeline.py +1 -1
- utils/image_utils.py +5 -2
app.py
CHANGED
@@ -31,18 +31,19 @@ def infer(path_input, seed):
     return [path_input, g_save_path], [path_input, d_save_path]

 def infer_video(path_input, seed):
-    frames_g, frames_d = lotus_video(path_input, 'depth', seed, device)
+    frames_g, frames_d, fps = lotus_video(path_input, 'depth', seed, device)
     if not os.path.exists("files/output"):
         os.makedirs("files/output")
     name_base, _ = os.path.splitext(os.path.basename(path_input))
     g_save_path = os.path.join("files/output", f"{name_base}_g.mp4")
     d_save_path = os.path.join("files/output", f"{name_base}_d.mp4")
-    imageio.mimsave(g_save_path, frames_g)
-    imageio.mimsave(d_save_path, frames_d)
+    imageio.mimsave(g_save_path, frames_g, fps=fps)
+    imageio.mimsave(d_save_path, frames_d, fps=fps)
     return [g_save_path, d_save_path]

 def run_demo_server():
     infer_gpu = spaces.GPU(functools.partial(infer))
+    infer_video_gpu = spaces.GPU(functools.partial(infer_video))
     gradio_theme = gr.themes.Default()

     with gr.Blocks(
@@ -113,49 +114,96 @@ def run_demo_server():
             """
         )
         with gr.Tabs(elem_classes=["tabs"]):
-            with gr.
-            with gr.
-            …
-            with gr.Row():
-                image_submit_btn = gr.Button(
-                    value="Predict Depth!", variant="primary"
+            with gr.Tab("IMAGE"):
+                with gr.Row():
+                    with gr.Column():
+                        image_input = gr.Image(
+                            label="Input Image",
+                            type="filepath",
+                        )
+                        seed = gr.Number(
+                            label="Seed (only for Generative mode)",
+                            minimum=0,
+                            maximum=999999999,
                         )
-            …
-            )
-            with gr.Row():
-                image_output_d = ImageSlider(
-                    label="Output (Discriminative)",
+                        with gr.Row():
+                            image_submit_btn = gr.Button(
+                                value="Predict Depth!", variant="primary"
+                            )
+                            image_reset_btn = gr.Button(value="Reset")
+                    with gr.Column():
+                        image_output_g = ImageSlider(
+                            label="Output (Generative)",
                             type="filepath",
                             interactive=False,
                             elem_classes="slider",
                             position=0.25,
                         )
+                        with gr.Row():
+                            image_output_d = ImageSlider(
+                                label="Output (Discriminative)",
+                                type="filepath",
+                                interactive=False,
+                                elem_classes="slider",
+                                position=0.25,
+                            )
+
+                gr.Examples(
+                    fn=infer_gpu,
+                    examples=sorted([
+                        [os.path.join("files", "images", name), 0]
+                        for name in os.listdir(os.path.join("files", "images"))
+                    ]),
+                    inputs=[image_input, seed],
+                    outputs=[image_output_g, image_output_d],
+                    cache_examples=False,
+                )
+
+            with gr.Tab("VIDEO"):
+                with gr.Row():
+                    with gr.Column():
+                        input_video = gr.Video(
+                            label="Input Video",
+                            autoplay=True,
+                            loop=True,
+                        )
+                        seed = gr.Number(
+                            label="Seed (only for Generative mode)",
+                            minimum=0,
+                            maximum=999999999,
+                        )
+                        with gr.Row():
+                            video_submit_btn = gr.Button(
+                                value="Predict Depth!", variant="primary"
+                            )
+                            video_reset_btn = gr.Button(value="Reset")
+                    with gr.Column():
+                        video_output_g = gr.Video(
+                            label="Output (Generative)",
+                            interactive=False,
+                            autoplay=True,
+                            loop=True,
+                            show_share_button=True,
+                        )
+                        with gr.Row():
+                            video_output_d = gr.Video(
+                                label="Output (Discriminative)",
+                                interactive=False,
+                                autoplay=True,
+                                loop=True,
+                                show_share_button=True,
+                            )

-            …
+                gr.Examples(
+                    fn=infer_video_gpu,
+                    examples=sorted([
+                        [os.path.join("files", "videos", name), 0]
+                        for name in os.listdir(os.path.join("files", "videos"))
+                    ]),
+                    inputs=[input_video, seed],
+                    outputs=[video_output_g, video_output_d],
+                    cache_examples=False,
+                )

         ### Image
         image_submit_btn.click(
@@ -175,6 +223,19 @@ def run_demo_server():
             queue=False,
         )

+        ### Video
+        video_submit_btn.click(
+            fn=infer_video_gpu,
+            inputs=[input_video, seed],
+            outputs=[video_output_g, video_output_d],
+            queue=True,
+        )
+        video_reset_btn.click(
+            fn=lambda: (None, None, None),
+            inputs=[],
+            outputs=[input_video, video_output_g, video_output_d],
+        )
+
         ### Server launch
         demo.queue(
             api_open=False,
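
Why the fps plumbing above matters: without an explicit rate, imageio.mimsave writes the MP4 at a default frame rate, so the depth videos could play back faster or slower than the source. A minimal sketch of the round trip, assuming the imageio v2 API with its ffmpeg plugin; the intensity inversion is only a stand-in for the Lotus prediction:

    import cv2
    import imageio

    def process_video(in_path, out_path):
        # Read the source clip and its frame rate with OpenCV.
        cap = cv2.VideoCapture(in_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes to BGR; convert to RGB before writing elsewhere.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        cap.release()

        outputs = [255 - f for f in frames]  # stand-in per-frame "prediction"

        # Propagate the source rate so the output plays at the original speed.
        imageio.mimsave(out_path, outputs, fps=fps)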
files/videos/00.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddb5e80168634ef46cdd5bb45178573a34001f147c7e96eb6220c09bfc0c4649
+size 3774878
files/videos/01.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a532ba2738716dbb244e0d7172cf681879218cbbdad09980404fa08ef6b9ecc
+size 3095352
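
The two MP4 entries above are three-line Git LFS pointers rather than the videos themselves: git stores the stub, LFS stores the binary. A sketch of how such a pointer is derived from a local file (lfs_pointer is a hypothetical helper; the format follows the spec v1 lines shown above):

    import hashlib
    import os

    def lfs_pointer(path):
        # SHA-256 over the full content, streamed in 1 MiB chunks.
        sha = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                sha.update(chunk)
        return (
            "version https://git-lfs.github.com/spec/v1\n"
            f"oid sha256:{sha.hexdigest()}\n"
            f"size {os.path.getsize(path)}\n"
        )

    # print(lfs_pointer("files/videos/00.mp4"))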
infer.py
CHANGED
@@ -19,7 +19,7 @@ import cv2

 check_min_version('0.28.0.dev0')

-def infer_pipe(pipe, image_input, task_name, seed, device):
+def infer_pipe(pipe, test_image, task_name, seed, device, video_depth=False):
     if seed is None:
         generator = None
     else:
@@ -31,7 +31,8 @@ def infer_pipe(pipe, image_input, task_name, seed, device):
     autocast_ctx = torch.autocast(pipe.device.type)
     with autocast_ctx:

-        test_image = Image.open(image_input).convert('RGB')
+        if video_depth == False:
+            test_image = Image.open(test_image).convert('RGB')
         test_image = np.array(test_image).astype(np.float16)
         test_image = torch.tensor(test_image).permute(2,0,1).unsqueeze(0)
         test_image = test_image / 127.5 - 1.0
@@ -55,17 +56,57 @@ def infer_pipe(pipe, image_input, task_name, seed, device):
     # Post-process the prediction
     if task_name == 'depth':
         output_npy = pred.mean(axis=-1)
-        output_color = colorize_depth_map(output_npy)
+        output_color = colorize_depth_map(output_npy, reverse_color=True)
     else:
         output_npy = pred
         output_color = Image.fromarray((output_npy * 255).astype(np.uint8))

     return output_color

-def lotus_video(input_video, task_name, seed, device):
+def infer_pipe_video(pipe, test_image, task_name, generator, device, latents=None):
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(pipe.device.type)
+    with autocast_ctx:
+        test_image = np.array(test_image).astype(np.float16)
+        test_image = torch.tensor(test_image).permute(2,0,1).unsqueeze(0)
+        test_image = test_image / 127.5 - 1.0
+        test_image = test_image.to(device)
+
+        task_emb = torch.tensor([1, 0]).float().unsqueeze(0).repeat(1, 1).to(device)
+        task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1).repeat(1, 1)
+
+        # Run
+        output = pipe(
+            rgb_in=test_image,
+            prompt='',
+            num_inference_steps=1,
+            generator=generator,
+            latents=latents,
+            # guidance_scale=0,
+            output_type='np',
+            timesteps=[999],
+            task_emb=task_emb,
+            return_dict=False
+        )
+        pred = output[0][0]
+        last_frame_latent = output[2]
+
+    # Post-process the prediction
+    if task_name == 'depth':
+        output_npy = pred.mean(axis=-1)
+        output_color = colorize_depth_map(output_npy, reverse_color=True)
+    else:
+        output_npy = pred
+        output_color = Image.fromarray((output_npy * 255).astype(np.uint8))
+
+    return output_color, last_frame_latent
+
+def load_pipe(task_name, device):
     if task_name == 'depth':
-        model_g = 'jingheya/lotus-depth-g-v1-0'
-        model_d = 'jingheya/lotus-depth-d-v1-1'
+        model_g = 'jingheya/lotus-depth-g-v2-0-disparity'
+        model_d = 'jingheya/lotus-depth-d-v2-0-disparity'
     else:
         model_g = 'jingheya/lotus-normal-g-v1-0'
         model_d = 'jingheya/lotus-normal-d-v1-0'
@@ -84,9 +125,17 @@ def lotus_video(input_video, task_name, seed, device):
     pipe_g.set_progress_bar_config(disable=True)
     pipe_d.set_progress_bar_config(disable=True)
     logging.info(f"Successfully loading pipeline from {model_g} and {model_d}.")
+    return pipe_g, pipe_d
+
+def lotus_video(input_video, task_name, seed, device):
+    pipe_g, pipe_d = load_pipe(task_name, device)

     # load the video and split it into frames
     cap = cv2.VideoCapture(input_video)
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
     frames = []
     while True:
         ret, frame = cap.read()
@@ -94,91 +143,32 @@ def lotus_video(input_video, task_name, seed, device):
             break
         frames.append(frame)
     cap.release()
-    logging.info(f"There are {len(frames)} frames in the video.")

+    # generate latents_common for lotus-g
     if seed is None:
         generator = None
     else:
         generator = torch.Generator(device=device).manual_seed(seed)
-    …
+    last_frame_latent = None
+    latent_common = torch.randn(
+        (1, 4, height // pipe_g.vae_scale_factor, width // pipe_g.vae_scale_factor), generator=generator, dtype=pipe_g.dtype, device=device
+    )

     output_g = []
     output_d = []
     for frame in frames:
-        …
-        test_image = torch.tensor(test_image).permute(2,0,1).unsqueeze(0)
-        test_image = test_image / 127.5 - 1.0
-        test_image = test_image.to(device)
+        latents = latent_common
+        if last_frame_latent is not None:
+            latents = 0.9 * latents + 0.1 * last_frame_latent
+        output_frame_g, last_frame_latent = infer_pipe_video(pipe_g, frame, task_name, generator, device, latents)
+        output_frame_d = infer_pipe(pipe_d, frame, task_name, seed, device, video_depth=True)
+        output_g.append(output_frame_g)
+        output_d.append(output_frame_d)

-        # Run
-        pred_g = pipe_g(
-            rgb_in=test_image,
-            prompt='',
-            num_inference_steps=1,
-            generator=generator,
-            # guidance_scale=0,
-            output_type='np',
-            timesteps=[999],
-            task_emb=task_emb,
-        ).images[0]
-        pred_d = pipe_d(
-            rgb_in=test_image,
-            prompt='',
-            num_inference_steps=1,
-            generator=generator,
-            # guidance_scale=0,
-            output_type='np',
-            timesteps=[999],
-            task_emb=task_emb,
-        ).images[0]
-
-        # Post-process the prediction
-        if task_name == 'depth':
-            output_npy_g = pred_g.mean(axis=-1)
-            output_color_g = colorize_depth_map(output_npy_g)
-            output_npy_d = pred_d.mean(axis=-1)
-            output_color_d = colorize_depth_map(output_npy_d)
-        else:
-            output_npy_g = pred_g
-            output_color_g = Image.fromarray((output_npy_g * 255).astype(np.uint8))
-            output_npy_d = pred_d
-            output_color_d = Image.fromarray((output_npy_d * 255).astype(np.uint8))
-
-        output_g.append(output_color_g)
-        output_d.append(output_color_d)
-
-    return output_g, output_d
+    return output_g, output_d, fps

 def lotus(image_input, task_name, seed, device):
-    if task_name == 'depth':
-        model_g = 'jingheya/lotus-depth-g-v1-0'
-        model_d = 'jingheya/lotus-depth-d-v1-1'
-    else:
-        model_g = 'jingheya/lotus-normal-g-v1-0'
-        model_d = 'jingheya/lotus-normal-d-v1-0'
-
-    dtype = torch.float16
-    pipe_g = LotusGPipeline.from_pretrained(
-        model_g,
-        torch_dtype=dtype,
-    )
-    pipe_d = LotusDPipeline.from_pretrained(
-        model_d,
-        torch_dtype=dtype,
-    )
-    pipe_g.to(device)
-    pipe_d.to(device)
-    pipe_g.set_progress_bar_config(disable=True)
-    pipe_d.set_progress_bar_config(disable=True)
-    logging.info(f"Successfully loading pipeline from {model_g} and {model_d}.")
+    pipe_g, pipe_d = load_pipe(task_name, device)
     output_g = infer_pipe(pipe_g, image_input, task_name, seed, device)
     output_d = infer_pipe(pipe_d, image_input, task_name, seed, device)
     return output_g, output_d
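
The change worth dwelling on is the latent handling in lotus_video: every frame is denoised from the same latent_common noise, nudged 10% toward the latent returned for the previous frame, which is what keeps the one-step predictions from flickering across frames. A self-contained sketch of just that blending rule (next_init_latents is a hypothetical helper; the random tensors stand in for the pipeline's latents):

    import torch

    def next_init_latents(latent_common, last_frame_latent, alpha=0.9):
        # Mostly the shared noise, plus a small carry-over from the previous
        # frame (the 0.9 / 0.1 split used in the diff above).
        if last_frame_latent is None:
            return latent_common
        return alpha * latent_common + (1 - alpha) * last_frame_latent

    # Latents are shaped (B, 4, H // vae_scale_factor, W // vae_scale_factor).
    gen = torch.Generator().manual_seed(0)
    latent_common = torch.randn((1, 4, 60, 80), generator=gen)

    last = None
    for _ in range(3):  # stand-in for the per-frame loop
        latents = next_init_latents(latent_common, last)
        # ... run the one-step pipeline with latents=latents here ...
        last = latents  # the real loop keeps the latent the pipeline returns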
pipeline.py
CHANGED
@@ -1279,6 +1279,6 @@ class LotusGPipeline(DirectDiffusionPipeline):
         self.maybe_free_model_hooks()

         if not return_dict:
-            return (image, has_nsfw_concept)
+            return (image, has_nsfw_concept, latents)

         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
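
Widening the return_dict=False tuple is a breaking change for any caller that still unpacks two values; the commit's own caller reads output[0][0] and output[2] accordingly. A small compatibility shim, a hypothetical helper not in the repo, could tolerate both layouts:

    def unpack_pipe_output(output):
        # New layout: (images, has_nsfw_concept, latents).
        if len(output) == 3:
            images, has_nsfw_concept, latents = output
        else:
            # Old layout: (images, has_nsfw_concept); no latents available.
            images, has_nsfw_concept = output
            latents = None
        return images, has_nsfw_concept, latents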
utils/image_utils.py
CHANGED
@@ -44,12 +44,15 @@ def concatenate_images(*image_lists):
     return new_image


-def colorize_depth_map(depth, mask=None):
+def colorize_depth_map(depth, mask=None, reverse_color=False):
     cm = matplotlib.colormaps["Spectral"]
     # normalize
     depth = ((depth - depth.min()) / (depth.max() - depth.min()))
     # colorize
-    img_colored_np = cm(depth, bytes=False)[:, :, 0:3]  # (h,w,3)
+    if reverse_color:
+        img_colored_np = cm(1 - depth, bytes=False)[:, :, 0:3]  # Invert the depth values before applying colormap
+    else:
+        img_colored_np = cm(depth, bytes=False)[:, :, 0:3]  # (h,w,3)
     depth_colored = (img_colored_np * 255).astype(np.uint8)
     if mask is not None:
         masked_image = np.zeros_like(depth_colored)
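
The reverse_color flag pairs with the switch to the v2-0 "disparity" checkpoints in infer.py: disparity grows as objects get closer, so the values are flipped before applying the Spectral colormap to keep the earlier near/far color convention (that reading is inferred from the model ids). A standalone sketch with a synthetic ramp in place of a real prediction (colorize is a hypothetical stand-in for colorize_depth_map):

    import matplotlib
    import numpy as np

    def colorize(depth, reverse_color=False):
        cm = matplotlib.colormaps["Spectral"]
        # Normalize to [0, 1] whatever the input range.
        d = (depth - depth.min()) / (depth.max() - depth.min())
        if reverse_color:
            d = 1 - d  # flip so disparity maps like depth
        rgb = cm(d, bytes=False)[:, :, 0:3]  # (h, w, 3) floats in [0, 1]
        return (rgb * 255).astype(np.uint8)

    ramp = np.tile(np.linspace(0.0, 1.0, 256), (32, 1))
    plain = colorize(ramp)
    flipped = colorize(ramp, reverse_color=True)
    # Reversing mirrors the mapping: column 0 of one matches column -1 of the other.
    assert (plain[:, 0] == flipped[:, -1]).all()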