Video-Depth-Anything_RGBD

Running on Zero

App Files Files Community

Video-Depth-Anything_RGBD / app.py

Krokodilpirat

Update app.py

033fb37 verified 27 days ago

raw

history blame

8.64 kB

	import os
	import gc
	import torch
	import cv2
	import gradio as gr
	import numpy as np
	import matplotlib.cm as cm

	from video_depth_anything.video_depth import VideoDepthAnything
	from utils.dc_utils import read_video_frames, save_video
	from huggingface_hub import hf_hub_download

	# Example rows for the Gradio demo. Each row follows the UI input order:
	# [video path, max_len, target_fps, max_res, stitch, grayscale, blur].
	examples = [
	    [f'assets/example_videos/{video_file}', -1, -1, 1280, False, False, 0]
	    for video_file in (
	        'davis_rollercoaster.mp4',
	        'Tokyo-Walk_rgb.mp4',
	        '4158877-uhd_3840_2160_30fps_rgb.mp4',
	        '4511004-uhd_3840_2160_24fps_rgb.mp4',
	        '1753029-hd_1920_1080_30fps.mp4',
	        'davis_burnout.mp4',
	        'example_5473765-l.mp4',
	        'Istanbul-26920.mp4',
	        'obj_1.mp4',
	        'sheep_cut1.mp4',
	    )
	]

	# Pick the compute device once at import time: CUDA when torch reports it
	# as available, otherwise the CPU.
	if torch.cuda.is_available():
	    DEVICE = 'cuda'
	else:
	    DEVICE = 'cpu'

	# Keyword arguments passed to the model constructor, keyed by encoder variant.
	model_configs = {
	    'vits': dict(encoder='vits', features=64, out_channels=[48, 96, 192, 384]),
	    'vitl': dict(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]),
	}
	# Encoder key -> size name used in the checkpoint repository id.
	encoder2name = dict(vits='Small', vitl='Large')
	# Encoder variant served by this demo.
	encoder = 'vitl'
	model_name = encoder2name[encoder]

	# Build the network for the selected encoder variant.
	video_depth_anything = VideoDepthAnything(**model_configs[encoder])

	# Resolve the checkpoint file for that variant through the Hugging Face Hub.
	filepath = hf_hub_download(
	    repo_type="model",
	    repo_id=f"depth-anything/Video-Depth-Anything-{model_name}",
	    filename=f"video_depth_anything_{encoder}.pth",
	)

	# Load the weights onto the CPU first, then move the model to DEVICE and call eval().
	# NOTE(review): torch.load unpickles the downloaded file; consider passing
	# weights_only=True if the installed torch version supports it.
	video_depth_anything.load_state_dict(torch.load(filepath, map_location='cpu'))
	video_depth_anything = video_depth_anything.to(DEVICE)
	video_depth_anything = video_depth_anything.eval()

	# Markdown heading rendered at the top of the demo page.
	title = "# Video Depth Anything"
	# Markdown blurb rendered below the heading; links to the paper, project page and repository.
	description = """Official demo for Video Depth Anything.
	Please refer to our [paper](https://arxiv.org/abs/2501.12375), [project page](https://videodepthanything.github.io/), and [github](https://github.com/DepthAnything/Video-Depth-Anything) for more details."""

	def infer_video_depth(
	    input_video: str,
	    max_len: int = -1,
	    target_fps: int = -1,
	    max_res: int = 1280,
	    stitch: bool = False,
	    grayscale: bool = False,
	    blur: float = 0.0,
	    *,  # The following parameters are keyword-only and cannot be overridden by UI input.
	    output_dir: str = './outputs',
	    input_size: int = 518,
	):
	    """Run depth inference on a video and write the result clips to ``output_dir``.

	    Two files are always written: the preprocessed RGB clip (``<name>_src.mp4``)
	    and the depth visualization (``<name>_vis.mp4``). When ``stitch`` is true, a
	    third file ``<name>_stitched.mp4`` is written with the full-resolution RGB
	    frame on the left and the depth rendering, resized to match, on the right.

	    Args:
	        input_video: Path of the video to process.
	        max_len: Forwarded to ``read_video_frames``.
	        target_fps: Forwarded to ``read_video_frames``; the frame rate it returns
	            is used for inference and for re-reading the video when stitching.
	        max_res: Maximum resolution passed to ``read_video_frames`` for the
	            frames used in inference.
	        stitch: Also produce the side-by-side RGB + depth video.
	        grayscale: Render the stitched depth as grayscale instead of the
	            "inferno" colormap.
	        blur: When greater than 0, Gaussian-blur the stitched depth rendering
	            with an odd kernel of ``int(blur * 20) * 2 + 1`` pixels.
	        output_dir: Directory that receives the output files; created if missing.
	        input_size: Forwarded to the model's ``infer_video_depth``.

	    Returns:
	        ``[processed_video_path, depth_vis_path, stitched_video_path]``; the last
	        entry is ``None`` when ``stitch`` is false.
	    """
	    # Read input video frames with the given maximum resolution (max_res) for inference.
	    frames, target_fps = read_video_frames(input_video, max_len, target_fps, max_res)
	    # Perform depth inference using the model.
	    depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=input_size, device=DEVICE)

	    # exist_ok avoids the check-then-create race between concurrent requests.
	    os.makedirs(output_dir, exist_ok=True)
	    stem = os.path.splitext(os.path.basename(input_video))[0]

	    # Save the preprocessed (RGB) video and the depth visualization (default color mapping).
	    processed_video_path = os.path.join(output_dir, stem + '_src.mp4')
	    depth_vis_path = os.path.join(output_dir, stem + '_vis.mp4')
	    save_video(frames, processed_video_path, fps=fps)
	    save_video(depths, depth_vis_path, fps=fps, is_depths=True)

	    stitched_video_path = None
	    if stitch:
	        # For stitching: read the original video in full resolution (without downscaling).
	        full_frames, _ = read_video_frames(input_video, max_len, target_fps, max_res=-1)

	        # Normalize with the global min/max so brightness is consistent across frames.
	        d_min, d_max = depths.min(), depths.max()
	        depth_range = d_max - d_min
	        if depth_range <= 0:
	            # Constant depth: avoid dividing by zero (NaN/inf before the uint8 cast);
	            # every pixel then normalizes to 0.
	            depth_range = 1.0

	        # Loop invariants: resolve the colormap and blur kernel once, not per frame.
	        cmap = None
	        if not grayscale:
	            import matplotlib

	            # matplotlib.cm.get_cmap is deprecated/removed in recent Matplotlib;
	            # prefer the colormap registry and fall back for older releases.
	            registry = getattr(matplotlib, "colormaps", None)
	            cmap = registry["inferno"] if registry is not None else cm.get_cmap("inferno")
	        kernel_size = int(blur * 20) * 2 + 1 if blur > 0 else 0  # odd when used

	        stitched_frames = []
	        # zip stops at the shorter sequence, matching the previous min(len, len) bound.
	        for rgb_full, depth_frame in zip(full_frames, depths):
	            # Normalize the depth frame to the range [0, 255].
	            depth_norm = ((depth_frame - d_min) / depth_range * 255).astype(np.uint8)
	            if grayscale:
	                depth_vis = np.stack([depth_norm] * 3, axis=-1)
	            else:
	                depth_vis = (cmap(depth_norm / 255.0)[..., :3] * 255).astype(np.uint8)
	            if kernel_size:
	                depth_vis = cv2.GaussianBlur(depth_vis, (kernel_size, kernel_size), 0)
	            # Resize the depth rendering to the full-resolution RGB frame, then
	            # place RGB (left) and depth (right) side by side.
	            H_full, W_full = rgb_full.shape[:2]
	            depth_vis_resized = cv2.resize(depth_vis, (W_full, H_full))
	            stitched_frames.append(cv2.hconcat([rgb_full, depth_vis_resized]))

	        stitched_frames = np.array(stitched_frames)
	        stitched_video_path = os.path.join(output_dir, stem + '_stitched.mp4')
	        save_video(stitched_frames, stitched_video_path, fps=fps)

	    gc.collect()
	    torch.cuda.empty_cache()

	    # Return the processed RGB video, depth visualization, and (if created) the stitched video.
	    return [processed_video_path, depth_vis_path, stitched_video_path]

	def construct_demo():
	    """Assemble the Gradio Blocks UI for the demo and return it without launching."""

	    def result_player(label):
	        # The three output players share every setting except the label.
	        return gr.Video(label=label, interactive=False, autoplay=True, loop=True, show_share_button=True, scale=5)

	    with gr.Blocks(analytics_enabled=False) as ui:
	        for markdown_text in (
	            title,
	            description,
	            "### If you find this work useful, please help ⭐ the [Github Repo](https://github.com/DepthAnything/Video-Depth-Anything). Thanks for your attention!",
	        ):
	            gr.Markdown(markdown_text)

	        # Top row: upload widget on the left, the three result players on the right.
	        with gr.Row(equal_height=True):
	            with gr.Column(scale=1):
	                # Video component used for file upload (no 'source' specified).
	                source_video = gr.Video(label="Input Video")
	            with gr.Column(scale=2):
	                with gr.Row(equal_height=True):
	                    src_player = result_player("Preprocessed Video")
	                    depth_player = result_player("Generated Depth Video")
	                    stitched_player = result_player("Stitched RGBD Video")

	        # Second row: collapsible settings plus the run button; the right column is an empty spacer.
	        with gr.Row(equal_height=True):
	            with gr.Column(scale=1):
	                with gr.Accordion("Advanced Settings", open=False):
	                    length_slider = gr.Slider(label="Max process length", minimum=-1, maximum=1000, value=500, step=1)
	                    fps_slider = gr.Slider(label="Target FPS", minimum=-1, maximum=30, value=15, step=1)
	                    resolution_slider = gr.Slider(label="Max side resolution", minimum=480, maximum=1920, value=1280, step=1)
	                    stitch_box = gr.Checkbox(label="Stitch RGB & Depth Videos", value=False)
	                    grayscale_box = gr.Checkbox(label="Output Depth as Grayscale", value=False)
	                    blur_amount = gr.Slider(minimum=0, maximum=1, step=0.01, label="Depth Blur Factor", value=0)
	                run_button = gr.Button("Generate")
	            with gr.Column(scale=2):
	                pass

	        # The examples gallery and the button use the same inputs and outputs,
	        # in the positional order infer_video_depth expects.
	        controls = [
	            source_video,
	            length_slider,
	            fps_slider,
	            resolution_slider,
	            stitch_box,
	            grayscale_box,
	            blur_amount,
	        ]
	        players = [src_player, depth_player, stitched_player]

	        gr.Examples(
	            examples=examples,
	            inputs=controls,
	            outputs=players,
	            fn=infer_video_depth,
	            cache_examples=True,
	            cache_mode="lazy",
	        )

	        run_button.click(fn=infer_video_depth, inputs=controls, outputs=players)

	    return ui

	# Script entry point: build the UI, enable request queuing, then serve it
	# with share=True.
	if __name__ == "__main__":
	    demo = construct_demo()
	    demo.queue().launch(share=True)