Spaces:
Build error
updated app for video inference
app.py
CHANGED
```diff
@@ -138,34 +138,35 @@ def image_inference(
                         out['render_masked'].cpu(), out['pred_target_shape_img'][0].cpu()], dim=2))
     return res[..., ::-1]
 
-def extract_frames(
-    [old lines 142-156 are blank in the rendered diff]
+def extract_frames(
+        driver_vid: gr.inputs.Video = None
+):
+    image_frames = []
+    vid = cv2.VideoCapture(driver_vid)  # path to mp4
+
+    while True:
+        success, img = vid.read()
+
+        if not success: break
+
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        pil_img = Image.fromarray(img)
+        image_frames.append(pil_img)
+
+    return image_frames
+
+def video_inference(
+        source_img: gr.inputs.Image = None,
+        driver_vid: gr.inputs.Video = None
+):
     image_frames = extract_frames(driver_vid)
 
     resulted_imgs = defaultdict(list)
 
-    video_folder = 'jenya_driver/'
-    image_frames = sorted(glob(f"{video_folder}/*", recursive=True), key=lambda x: int(x.split('/')[-1][:-4]))
-
     mask_hard_threshold = 0.5
-    N = len(image_frames)
-    for i in range(0, N, 4):
-        new_out = infer.evaluate(source_img,
-                                 source_information_for_reuse=out.get('source_information'))
+    N = len(image_frames)
+    for i in range(0, N, 4):  # frame limits
+        new_out = infer.evaluate(source_img, image_frames[i])
 
         mask_pred = (new_out['pred_target_unet_mask'].cpu() > mask_hard_threshold).float()
         mask_pred = mask_errosion(mask_pred[0].float().numpy() * 255)
```
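The new `extract_frames` decodes the entire driver video into memory as PIL images, even though the inference loop that follows only touches every fourth frame. For long clips it can be cheaper to apply the stride at decode time. The sketch below is illustrative, not part of the commit (the name `extract_frames_strided` and its `stride` parameter are assumptions), and uses only standard OpenCV/PIL calls:

```python
import cv2
from PIL import Image

def extract_frames_strided(video_path: str, stride: int = 4):
    """Keep only every `stride`-th frame of a video as a PIL image.

    Illustrative variant of the app's extract_frames: the committed code
    stores every decoded frame and then skips three of every four in the
    inference loop; skipping at decode time bounds memory by N / stride.
    """
    vid = cv2.VideoCapture(video_path)
    frames = []
    index = 0
    while True:
        success, img = vid.read()  # success is False once the stream ends
        if not success:
            break
        if index % stride == 0:
            # OpenCV decodes to BGR; PIL expects RGB
            frames.append(Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)))
        index += 1
    vid.release()
    return frames
```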
```diff
@@ -192,34 +193,41 @@ def video_inference(source_img, driver_vid):
         im.set_data(video[i,:,:,::-1])
         return im
 
-    anim = animation.FuncAnimation(fig, animate, init_func=init,
-        [continuation not shown in the rendered diff]
+    anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0], interval=30)
+    anim.save("avatar.gif", dpi=300, writer = animation.PillowWriter(fps=24))
 
-    return
+    return "avatar.gif"
 
 with gr.Blocks() as demo:
     gr.Markdown("# **<p align='center'>ROME: Realistic one-shot mesh-based head avatars</p>**")
-
+
     gr.Markdown(
         """
+        <img src='https://github.com/SamsungLabs/rome/blob/main/media/tease.gif'>
+
         <p style='text-align: center'>
         Create a personal avatar from just a single image using ROME.
         <br> <a href='https://arxiv.org/abs/2206.08343' target='_blank'>Paper</a> | <a href='https://samsunglabs.github.io/rome' target='_blank'>Project Page</a> | <a href='https://github.com/SamsungLabs/rome' target='_blank'>Github</a>
         </p>
+
+        <blockquote>
+        [The] system creates realistic mesh-based avatars from a single <strong>source</strong>
+        photo. These avatars are rigged, i.e., they can be driven by the animation parameters from a different <strong>driving</strong> frame.
+        </blockquote>
         """
     )
 
     with gr.Tab("Image Inference"):
         with gr.Row():
-            source_img = gr.Image(type="pil", label="…
-            driver_img = gr.Image(type="pil", label="…
-            image_output = gr.Image()
+            source_img = gr.Image(type="pil", label="Source image", show_label=True)
+            driver_img = gr.Image(type="pil", label="Driver image", show_label=True)
+            image_output = gr.Image("Rendered avatar")
         image_button = gr.Button("Predict")
     with gr.Tab("Video Inference"):
         with gr.Row():
             source_img2 = gr.Image(type="pil", label="source image", show_label=True)
             driver_vid = gr.Video(label="driver video")
-            video_output = gr.Image()
+            video_output = gr.Image(label="Rendered GIF avatar")
         video_button = gr.Button("Predict")
 
     gr.Examples(
```
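The GIF is produced by rendering every frame through a matplotlib figure (`FuncAnimation` plus `PillowWriter`), which is heavyweight at `dpi=300`; note also that `interval=30` only affects interactive playback, while the saved file's timing comes from the writer's `fps=24`. A leaner route, sketched here under the assumption that the frames are uint8 BGR arrays as elsewhere in the app (the helper name `save_gif` is hypothetical), writes them straight to a GIF with PIL:

```python
from PIL import Image

def save_gif(frames, path="avatar.gif", fps=24):
    """Write a list of uint8 BGR frames straight to a GIF with PIL.

    Hypothetical alternative to the FuncAnimation/PillowWriter route in
    the commit; avoids rasterizing each frame through a matplotlib figure.
    """
    pil_frames = [Image.fromarray(f[..., ::-1]) for f in frames]  # BGR -> RGB
    pil_frames[0].save(
        path,
        save_all=True,                # write a multi-frame file
        append_images=pil_frames[1:],
        duration=int(1000 / fps),     # per-frame display time in ms
        loop=0,                       # 0 = loop forever
    )
    return path
```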
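Neither hunk shows the event bindings that connect these components to the handlers; they presumably sit below the `gr.Examples(` call where the diff is cut off. For orientation, a typical Gradio Blocks wiring for the components above would look like the following sketch, which is assumed rather than taken from the commit:

```python
    # Assumed wiring (not shown in the diff), placed inside the
    # `with gr.Blocks() as demo:` context after the components above.
    image_button.click(
        fn=image_inference,
        inputs=[source_img, driver_img],
        outputs=image_output,
    )
    video_button.click(
        fn=video_inference,                # returns the path "avatar.gif"
        inputs=[source_img2, driver_vid],
        outputs=video_output,
    )

demo.launch()
```

Gradio passes the `inputs` components' values to `fn` and routes its return value to `outputs`; a filepath returned by `video_inference` is a valid value for a `gr.Image` output.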