# Image3DProcessor.py
import sys
import os
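# setup.sh is assumed to clone and build the splatter-image and
# diff-gaussian-rasterization dependencies that are added to sys.path below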
os.system('bash setup.sh')
sys.path.append('/home/user/app/splatter-image')
sys.path.append('/home/user/app/diff-gaussian-rasterization')
import torch
import torchvision
import numpy as np
import imageio
from PIL import Image
import rembg
from omegaconf import OmegaConf
from huggingface_hub import hf_hub_download
from io import BytesIO
from utils.app_utils import (
remove_background,
resize_foreground,
set_white_background,
resize_to_128,
to_tensor,
get_source_camera_v2w_rmo_and_quats,
get_target_cameras,
export_to_obj
)
from scene.gaussian_predictor import GaussianSplatPredictor
from gaussian_renderer import render_predicted


class Image3DProcessor:
def __init__(self, model_cfg_path, model_repo_id, model_filename):
self.use_cuda = torch.cuda.is_available()
self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print(f"Image3DProcessor device: {self.device}")
# Load model configuration
self.model_cfg = OmegaConf.load(model_cfg_path)
        # Load pre-trained model weights: use a local checkpoint if one is
        # present, otherwise fetch it from the Hugging Face Hub
        if os.path.isfile(model_filename):
            model_path = model_filename
        else:
            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
self.model = GaussianSplatPredictor(self.model_cfg)
ckpt_loaded = torch.load(model_path, map_location=self.device)
self.model.load_state_dict(ckpt_loaded["model_state_dict"])
self.model.to(self.device)
        self.model.eval()

    @torch.no_grad()
def preprocess(self, input_image, preprocess_background=True, foreground_ratio=0.65):
        # Create a new rembg session for background removal (re-created on
        # every call; could be cached on self if preprocess is called often)
        rembg_session = rembg.new_session()
# Convert bytes to a PIL image if necessary
if isinstance(input_image, bytes):
input_image = Image.open(BytesIO(input_image))
# Preprocess input image
if preprocess_background:
image = input_image.convert("RGB")
image = remove_background(image, rembg_session)
image = resize_foreground(image, foreground_ratio)
image = set_white_background(image)
else:
image = input_image
if image.mode == "RGBA":
image = set_white_background(image)
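        # The predictor expects 128x128 inputs (renders are upscaled to 512 later)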
image = resize_to_128(image)
        return image

    @torch.no_grad()
def reconstruct_and_export(self, image):
"""
Passes image through model and outputs the reconstruction.
"""
        image = np.array(image)
image_tensor = to_tensor(image).to(self.device)
view_to_world_source, rot_transform_quats = get_source_camera_v2w_rmo_and_quats()
view_to_world_source = view_to_world_source.to(self.device)
rot_transform_quats = rot_transform_quats.to(self.device)
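        # Predict a set of 3D Gaussians from the single source view; the two
        # unsqueeze calls add batch and view dimensions (B=1, N_views=1)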
reconstruction_unactivated = self.model(
image_tensor.unsqueeze(0).unsqueeze(0),
view_to_world_source,
rot_transform_quats,
None,
activate_output=False
)
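        # Drop the batch dimension and apply the scaling/opacity activations
        # that were skipped above with activate_output=False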
reconstruction = {k: v[0].contiguous() for k, v in reconstruction_unactivated.items()}
reconstruction["scaling"] = self.model.scaling_activation(reconstruction["scaling"])
reconstruction["opacity"] = self.model.opacity_activation(reconstruction["opacity"])
# Render images in a loop
world_view_transforms, full_proj_transforms, camera_centers = get_target_cameras()
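        # Render against a white background, matching the preprocessed input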
background = torch.tensor([1, 1, 1], dtype=torch.float32, device=self.device)
loop_renders = []
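        # Upscale each render to 512x512 with nearest-neighbour interpolation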
t_to_512 = torchvision.transforms.Resize(512, interpolation=torchvision.transforms.InterpolationMode.NEAREST)
for r_idx in range(world_view_transforms.shape[0]):
rendered_image = render_predicted(
reconstruction,
world_view_transforms[r_idx].to(self.device),
full_proj_transforms[r_idx].to(self.device),
camera_centers[r_idx].to(self.device),
background,
self.model_cfg,
focals_pixels=None
)["render"]
rendered_image = t_to_512(rendered_image)
            # Convert the CHW float render in [0, 1] to an HWC uint8 frame
            loop_renders.append(
                torch.clamp(rendered_image * 255, 0.0, 255.0)
                .detach().permute(1, 2, 0).cpu().numpy().astype(np.uint8)
            )
# Save video to a file and load its content
video_path = "loop_.mp4"
imageio.mimsave(video_path, loop_renders, fps=25)
with open(video_path, "rb") as video_file:
video_data = video_file.read()
return video_data
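

if __name__ == "__main__":
    # Minimal usage sketch. The config path, repo id, checkpoint name and
    # image paths below are hypothetical placeholders, not values taken
    # from this repository.
    processor = Image3DProcessor(
        model_cfg_path="gradio_config.yaml",           # hypothetical config
        model_repo_id="your-username/splatter-image",  # hypothetical repo id
        model_filename="model_latest.pth",             # hypothetical filename
    )
    with open("example.png", "rb") as f:               # hypothetical input
        processed = processor.preprocess(f.read())
    video_bytes = processor.reconstruct_and_export(processed)
    with open("orbit.mp4", "wb") as f:
        f.write(video_bytes)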