Video-Depth-Anything_RGBD

Running on Zero

App Files Files Community

Krokodilpirat committed 26 days ago

Commit

033fb37

verified ·

1 Parent(s): 4423b71

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -20

app.py CHANGED Viewed

@@ -10,8 +10,7 @@ from video_depth_anything.video_depth import VideoDepthAnything
 from utils.dc_utils import read_video_frames, save_video
 from huggingface_hub import hf_hub_download
-# Examples for the Gradio Demo
-# Hier wurden die zusätzlichen Parameter (stitch, grayscale, blur) mit Standardwerten ergänzt.
 examples = [
     ['assets/example_videos/davis_rollercoaster.mp4', -1, -1, 1280, False, False, 0],
     ['assets/example_videos/Tokyo-Walk_rgb.mp4', -1, -1, 1280, False, False, 0],
@@ -25,19 +24,18 @@ examples = [
     ['assets/example_videos/sheep_cut1.mp4', -1, -1, 1280, False, False, 0],
 ]
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-# Model configuration
 model_configs = {
     'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
     'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
 }
 encoder2name = {
     'vits': 'Small',
     'vitl': 'Large',
 }
 encoder = 'vitl'
 model_name = encoder2name[encoder]
@@ -63,47 +61,52 @@ def infer_video_depth(
     stitch: bool = False,
     grayscale: bool = False,
     blur: float = 0.0,
-    *,  # Keyword-only parameters folgen ab hier:
     output_dir: str = './outputs',
     input_size: int = 518,
 ):
-    # Read input video frames
     frames, target_fps = read_video_frames(input_video, max_len, target_fps, max_res)
-    # Infer depth maps using the model
     depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=input_size, device=DEVICE)
     video_name = os.path.basename(input_video)
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
-    # Save the processed (RGB) video and the depth visualization (using the default color mapping)
     processed_video_path = os.path.join(output_dir, os.path.splitext(video_name)[0] + '_src.mp4')
     depth_vis_path = os.path.join(output_dir, os.path.splitext(video_name)[0] + '_vis.mp4')
     save_video(frames, processed_video_path, fps=fps)
     save_video(depths, depth_vis_path, fps=fps, is_depths=True)
-    stitched_video_path = ""
     if stitch:
-        # Create a stitched video: left side is the RGB video, right side is the depth video
         d_min, d_max = depths.min(), depths.max()
         stitched_frames = []
-        for i in range(min(len(frames), len(depths))):
-            rgb_frame = frames[i]
             depth_frame = depths[i]
-            # Normalize the depth frame to [0, 255]
             depth_norm = ((depth_frame - d_min) / (d_max - d_min) * 255).astype(np.uint8)
-            # Use grayscale or colored mapping for the depth channel
             if grayscale:
                 depth_vis = np.stack([depth_norm] * 3, axis=-1)
             else:
                 cmap = cm.get_cmap("inferno")
                 depth_vis = (cmap(depth_norm / 255.0)[..., :3] * 255).astype(np.uint8)
-            # Apply Gaussian blur if requested (blur factor > 0)
             if blur > 0:
                 kernel_size = int(blur * 20) * 2 + 1  # ensures an odd kernel size
                 depth_vis = cv2.GaussianBlur(depth_vis, (kernel_size, kernel_size), 0)
-            # Concatenate side-by-side: RGB frame on the left, processed depth on the right
-            stitched = cv2.hconcat([rgb_frame, depth_vis])
             stitched_frames.append(stitched)
         stitched_frames = np.array(stitched_frames)
         stitched_video_path = os.path.join(output_dir, os.path.splitext(video_name)[0] + '_stitched.mp4')
@@ -112,7 +115,7 @@ def infer_video_depth(
     gc.collect()
     torch.cuda.empty_cache()
-    # Return processed RGB video, depth visualization, and (if created) stitched video.
     return [processed_video_path, depth_vis_path, stitched_video_path]
 def construct_demo():
@@ -123,7 +126,7 @@ def construct_demo():
         with gr.Row(equal_height=True):
             with gr.Column(scale=1):
-                # Hier verwenden wir den Video-Component ohne den 'source'-Parameter.
                 input_video = gr.Video(label="Input Video")
             with gr.Column(scale=2):
                 with gr.Row(equal_height=True):

 from utils.dc_utils import read_video_frames, save_video
 from huggingface_hub import hf_hub_download
+# Examples for the Gradio Demo (the additional parameters: stitch, grayscale, blur are appended)
 examples = [
     ['assets/example_videos/davis_rollercoaster.mp4', -1, -1, 1280, False, False, 0],
     ['assets/example_videos/Tokyo-Walk_rgb.mp4', -1, -1, 1280, False, False, 0],
     ['assets/example_videos/sheep_cut1.mp4', -1, -1, 1280, False, False, 0],
 ]
+# Determine the device: use GPU if available, else CPU.
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+# Model configuration for different encoder variants.
 model_configs = {
     'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
     'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
 }
 encoder2name = {
     'vits': 'Small',
     'vitl': 'Large',
 }
 encoder = 'vitl'
 model_name = encoder2name[encoder]
     stitch: bool = False,
     grayscale: bool = False,
     blur: float = 0.0,
+    *,  # The following parameters are keyword-only and cannot be overridden by UI input.
     output_dir: str = './outputs',
     input_size: int = 518,
 ):
+    # Read input video frames with the given maximum resolution (max_res) for inference.
     frames, target_fps = read_video_frames(input_video, max_len, target_fps, max_res)
+    # Perform depth inference using the model.
     depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=input_size, device=DEVICE)
     video_name = os.path.basename(input_video)
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
+    # Save the preprocessed (RGB) video and the depth visualization (using the default color mapping)
     processed_video_path = os.path.join(output_dir, os.path.splitext(video_name)[0] + '_src.mp4')
     depth_vis_path = os.path.join(output_dir, os.path.splitext(video_name)[0] + '_vis.mp4')
     save_video(frames, processed_video_path, fps=fps)
     save_video(depths, depth_vis_path, fps=fps, is_depths=True)
+    stitched_video_path = None
     if stitch:
+        # For stitching: read the original video in full resolution (without downscaling)
+        full_frames, _ = read_video_frames(input_video, max_len, target_fps, max_res=-1)
+        # For each frame, create a visual depth image from the inferred depth maps (which are at the downscaled resolution)
         d_min, d_max = depths.min(), depths.max()
         stitched_frames = []
+        for i in range(min(len(full_frames), len(depths))):
+            rgb_full = full_frames[i]  # Full-resolution RGB frame
             depth_frame = depths[i]
+            # Normalize the depth frame to the range [0, 255]
             depth_norm = ((depth_frame - d_min) / (d_max - d_min) * 255).astype(np.uint8)
+            # Create either a grayscale image or apply the inferno colormap, depending on the setting.
             if grayscale:
                 depth_vis = np.stack([depth_norm] * 3, axis=-1)
             else:
                 cmap = cm.get_cmap("inferno")
                 depth_vis = (cmap(depth_norm / 255.0)[..., :3] * 255).astype(np.uint8)
+            # Apply Gaussian blur if requested (if blur factor > 0)
             if blur > 0:
                 kernel_size = int(blur * 20) * 2 + 1  # ensures an odd kernel size
                 depth_vis = cv2.GaussianBlur(depth_vis, (kernel_size, kernel_size), 0)
+            # Resize the depth visual image to match the full-resolution RGB frame.
+            H_full, W_full = rgb_full.shape[:2]
+            depth_vis_resized = cv2.resize(depth_vis, (W_full, H_full))
+            # Concatenate the full-resolution RGB frame (left) and the resized depth visual (right) side-by-side.
+            stitched = cv2.hconcat([rgb_full, depth_vis_resized])
             stitched_frames.append(stitched)
         stitched_frames = np.array(stitched_frames)
         stitched_video_path = os.path.join(output_dir, os.path.splitext(video_name)[0] + '_stitched.mp4')
     gc.collect()
     torch.cuda.empty_cache()
+    # Return the processed RGB video, depth visualization, and (if created) the stitched video.
     return [processed_video_path, depth_vis_path, stitched_video_path]
 def construct_demo():
         with gr.Row(equal_height=True):
             with gr.Column(scale=1):
+                # Use the Video component for file upload (without specifying 'source')
                 input_video = gr.Video(label="Input Video")
             with gr.Column(scale=2):
                 with gr.Row(equal_height=True):