Spaces:
Running
on
Zero
Running
on
Zero
JianyuanWang
committed on
Commit
•
b19c7bf
1
Parent(s):
27a6ae5
solve
Browse files
app.py
CHANGED
@@ -42,7 +42,7 @@ def vggsfm_demo(
|
|
42 |
|
43 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
44 |
|
45 |
-
max_input_image =
|
46 |
|
47 |
target_dir = f"input_images_{timestamp}"
|
48 |
if os.path.exists(target_dir):
|
@@ -203,7 +203,7 @@ with gr.Blocks() as demo:
|
|
203 |
<li>upload the images (.jpg, .png, etc.), or </li>
|
204 |
<li>upload a video (.mp4, .mov, etc.) </li>
|
205 |
</ul>
|
206 |
-
<p>If both images and videos are uploaded, the demo will only reconstruct the uploaded images. By default, we extract <strong> 1 image frame per second from the input video </strong>. To prevent crashes on the Hugging Face space, we currently limit reconstruction to the first
|
207 |
<p>SfM methods are designed for <strong> rigid/static reconstruction </strong>. When dealing with dynamic/moving inputs, these methods may still work by focusing on the rigid parts of the scene. However, to ensure high-quality results, it is better to minimize the presence of moving objects in the input data. </p>
|
208 |
<p>The reconstruction should typically take <strong> up to 90 seconds </strong>. If it takes longer, the input data is likely not well-conditioned. </p>
|
209 |
<p>If you meet any problem, feel free to create an issue in our <a href="https://github.com/facebookresearch/vggsfm" target="_blank">GitHub Repo</a> ⭐</p>
|
@@ -245,6 +245,7 @@ with gr.Blocks() as demo:
|
|
245 |
cache_examples=True,
|
246 |
)
|
247 |
|
|
|
248 |
submit_btn.click(
|
249 |
vggsfm_demo,
|
250 |
[input_video, input_images, num_query_images, num_query_points],
|
|
|
42 |
|
43 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
44 |
|
45 |
+
max_input_image = 25
|
46 |
|
47 |
target_dir = f"input_images_{timestamp}"
|
48 |
if os.path.exists(target_dir):
|
|
|
203 |
<li>upload the images (.jpg, .png, etc.), or </li>
|
204 |
<li>upload a video (.mp4, .mov, etc.) </li>
|
205 |
</ul>
|
206 |
+
<p>If both images and videos are uploaded, the demo will only reconstruct the uploaded images. By default, we extract <strong> 1 image frame per second from the input video </strong>. To prevent crashes on the Hugging Face space, we currently limit reconstruction to the first 25 image frames. </p>
|
207 |
<p>SfM methods are designed for <strong> rigid/static reconstruction </strong>. When dealing with dynamic/moving inputs, these methods may still work by focusing on the rigid parts of the scene. However, to ensure high-quality results, it is better to minimize the presence of moving objects in the input data. </p>
|
208 |
<p>The reconstruction should typically take <strong> up to 90 seconds </strong>. If it takes longer, the input data is likely not well-conditioned. </p>
|
209 |
<p>If you meet any problem, feel free to create an issue in our <a href="https://github.com/facebookresearch/vggsfm" target="_blank">GitHub Repo</a> ⭐</p>
|
|
|
245 |
cache_examples=True,
|
246 |
)
|
247 |
|
248 |
+
|
249 |
submit_btn.click(
|
250 |
vggsfm_demo,
|
251 |
[input_video, input_images, num_query_images, num_query_points],
|
vggsfm_code/vggsfm/models/triangulator.py
CHANGED
@@ -323,7 +323,7 @@ class Triangulator(nn.Module):
|
|
323 |
# We adopt LORANSAC here again
|
324 |
|
325 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
326 |
-
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
327 |
)
|
328 |
|
329 |
# Determine valid tracks based on inlier numbers
|
|
|
323 |
# We adopt LORANSAC here again
|
324 |
|
325 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
326 |
+
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
327 |
)
|
328 |
|
329 |
# Determine valid tracks based on inlier numbers
|
vggsfm_code/vggsfm/utils/triangulation.py
CHANGED
@@ -755,7 +755,7 @@ def iterative_global_BA(
|
|
755 |
|
756 |
# triangulate tracks by LORANSAC
|
757 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
758 |
-
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
759 |
)
|
760 |
|
761 |
best_triangulated_points[valid_tracks] = points3D_opt
|
|
|
755 |
|
756 |
# triangulate tracks by LORANSAC
|
757 |
best_triangulated_points, best_inlier_num, best_inlier_mask = triangulate_tracks(
|
758 |
+
extrinsics, tracks_normalized_refined, track_vis=pred_vis, track_score=pred_score
|
759 |
)
|
760 |
|
761 |
best_triangulated_points[valid_tracks] = points3D_opt
|
vggsfm_code/vggsfm/utils/triangulation_helpers.py
CHANGED
@@ -384,7 +384,7 @@ def generate_combinations(N):
|
|
384 |
return comb_array
|
385 |
|
386 |
|
387 |
-
def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_num=50):
|
388 |
"""
|
389 |
Local Refinement for triangulation
|
390 |
"""
|
@@ -392,7 +392,6 @@ def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_nu
|
|
392 |
batch_index = torch.arange(B).unsqueeze(-1).expand(-1, lo_num)
|
393 |
|
394 |
points1_expand = points1.unsqueeze(1).expand(-1, lo_num, -1, -1)
|
395 |
-
extrinsics_expand = extrinsics.unsqueeze(1).expand(-1, lo_num, -1, -1, -1)
|
396 |
|
397 |
# The sets selected for local refinement
|
398 |
lo_indices = sorted_indices[:, :lo_num]
|
@@ -402,18 +401,38 @@ def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_nu
|
|
402 |
lo_points1 = torch.zeros_like(points1_expand)
|
403 |
lo_points1[lo_mask] = points1_expand[lo_mask]
|
404 |
|
405 |
-
lo_points1 = lo_points1.reshape(B * lo_num, N, -1)
|
406 |
-
lo_mask = lo_mask.reshape(B * lo_num, N)
|
407 |
-
lo_extrinsics = extrinsics_expand.reshape(B * lo_num, N, 3, 4)
|
408 |
-
|
409 |
-
# triangulate the inliers
|
410 |
-
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
411 |
-
lo_extrinsics, lo_points1, mask=lo_mask, compute_tri_angle=True, check_cheirality=True
|
412 |
-
)
|
413 |
|
414 |
-
triangulated_points = triangulated_points.reshape(B, lo_num, 3)
|
415 |
-
tri_angles = tri_angles.reshape(B, lo_num, -1)
|
416 |
|
417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
418 |
|
419 |
return triangulated_points, tri_angles, invalid_che_mask
|
|
|
384 |
return comb_array
|
385 |
|
386 |
|
387 |
+
def local_refinement_tri(points1, extrinsics, inlier_mask, sorted_indices, lo_num=50, low_mem=True):
|
388 |
"""
|
389 |
Local Refinement for triangulation
|
390 |
"""
|
|
|
392 |
batch_index = torch.arange(B).unsqueeze(-1).expand(-1, lo_num)
|
393 |
|
394 |
points1_expand = points1.unsqueeze(1).expand(-1, lo_num, -1, -1)
|
|
|
395 |
|
396 |
# The sets selected for local refinement
|
397 |
lo_indices = sorted_indices[:, :lo_num]
|
|
|
401 |
lo_points1 = torch.zeros_like(points1_expand)
|
402 |
lo_points1[lo_mask] = points1_expand[lo_mask]
|
403 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
404 |
|
|
|
|
|
405 |
|
406 |
+
if low_mem:
|
407 |
+
all_triangulated_points = []
|
408 |
+
all_tri_angles = []
|
409 |
+
all_invalid_che_mask = []
|
410 |
+
|
411 |
+
for loidx in range(lo_num):
|
412 |
+
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
413 |
+
extrinsics, lo_points1[:, loidx], mask=lo_mask[:, loidx], compute_tri_angle=True, check_cheirality=True
|
414 |
+
)
|
415 |
+
# Append the outputs to the respective lists
|
416 |
+
all_triangulated_points.append(triangulated_points[:, None])
|
417 |
+
all_tri_angles.append(tri_angles[:, None])
|
418 |
+
all_invalid_che_mask.append(invalid_che_mask[:,None])
|
419 |
+
|
420 |
+
triangulated_points = torch.cat(all_triangulated_points, dim=1)
|
421 |
+
tri_angles = torch.cat(all_tri_angles, dim=1)
|
422 |
+
invalid_che_mask = torch.cat(all_invalid_che_mask, dim=1)
|
423 |
+
else:
|
424 |
+
extrinsics_expand = extrinsics.unsqueeze(1).expand(-1, lo_num, -1, -1, -1)
|
425 |
+
lo_points1 = lo_points1.reshape(B * lo_num, N, -1)
|
426 |
+
lo_mask = lo_mask.reshape(B * lo_num, N)
|
427 |
+
lo_extrinsics = extrinsics_expand.reshape(B * lo_num, N, 3, 4)
|
428 |
+
|
429 |
+
# triangulate the inliers
|
430 |
+
triangulated_points, tri_angles, invalid_che_mask = triangulate_multi_view_point_batched(
|
431 |
+
lo_extrinsics, lo_points1, mask=lo_mask, compute_tri_angle=True, check_cheirality=True
|
432 |
+
)
|
433 |
+
|
434 |
+
triangulated_points = triangulated_points.reshape(B, lo_num, 3)
|
435 |
+
tri_angles = tri_angles.reshape(B, lo_num, -1)
|
436 |
+
invalid_che_mask = invalid_che_mask.reshape(B, lo_num)
|
437 |
|
438 |
return triangulated_points, tri_angles, invalid_che_mask
|