Spaces:

Dref360
/

human-interaction-demo

Running

App Files Files Community

Dref360 committed on Nov 23, 2024

Commit

fbb0b68

1 Parent(s): afb0729

First commit

Browse files

Files changed (3) hide show

.gitignore +162 -0
README.md +6 -6
app.py +170 -111

.gitignore ADDED Viewed

	@@ -0,0 +1,162 @@

+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+.idea/
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

README.md CHANGED Viewed

@@ -1,14 +1,14 @@
 ---
-title: Vit Pose Playground
-emoji: 🏆
-colorFrom: green
-colorTo: yellow
 sdk: gradio
-sdk_version: 5.5.0
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Small Space to test ViTPose
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Human Interaction Demo
+emoji: 📊
+colorFrom: gray
+colorTo: blue
 sdk: gradio
+sdk_version: 5.6.0
 app_file: app.py
 pinned: false
 license: apache-2.0
+short_description: Uses pose estimation to determine what you are touching.
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,50 +1,52 @@
 import gradio as gr
-import torch
 import numpy as np
-import cv2
-from PIL import Image
 import supervision as sv
 from transformers import (
     RTDetrForObjectDetection,
     RTDetrImageProcessor,
-    VitPoseConfig,
     VitPoseForPoseEstimation,
     VitPoseImageProcessor,
 )
-KEYPOINT_LABEL_MAP =     {
-        0: "Nose",
-        1: "L_Eye",
-        2: "R_Eye",
-        3: "L_Ear",
-        4: "R_Ear",
-        5: "L_Shoulder",
-        6: "R_Shoulder",
-        7: "L_Elbow",
-        8: "R_Elbow",
-        9: "L_Wrist",
-        10: "R_Wrist",
-        11: "L_Hip",
-        12: "R_Hip",
-        13: "L_Knee",
-        14: "R_Knee",
-        15: "L_Ankle",
-        16: "R_Ankle",
-    }
-class KeypointDetector:
     def __init__(self):
         self.person_detector = None
         self.person_processor = None
         self.pose_model = None
         self.pose_processor = None
         self.load_models()
     def load_models(self):
         """Load all required models"""
-        # Object detection model
         self.person_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
         self.person_detector = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
@@ -52,21 +54,35 @@ class KeypointDetector:
         self.pose_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple")
         self.pose_model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple")
-    @staticmethod
-    def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray:
-        """Convert Pascal VOC format to COCO format"""
-        bboxes = bboxes.copy()  # Create a copy to avoid modifying the input
-        bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
-        bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
-        return bboxes
-    @staticmethod
-    def coco_to_xyxy(bboxes: np.ndarray) -> np.ndarray:
-        """Convert COCO format (x,y,w,h) to xyxy format (x1,y1,x2,y2)"""
-        bboxes = bboxes.copy()
-        bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
-        bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
-        return bboxes
     def detect_persons(self, image: Image.Image):
         """Detect persons in the image"""
@@ -80,70 +96,105 @@ class KeypointDetector:
             threshold=0.3
         )
-        # Get boxes and scores for human class (index 0 in COCO dataset)
         boxes = results[0]["boxes"][results[0]["labels"] == 0]
         scores = results[0]["scores"][results[0]["labels"] == 0]
         return boxes.cpu().numpy(), scores.cpu().numpy()
     def detect_keypoints(self, image: Image.Image):
         """Detect keypoints in the image"""
-        # Detect persons first
         boxes, scores = self.detect_persons(image)
-        boxes_coco = [self.pascal_voc_to_coco(boxes)]
-        # Detect pose keypoints
-        pixel_values = self.pose_processor(image, boxes=boxes_coco, return_tensors="pt").pixel_values
         with torch.no_grad():
             outputs = self.pose_model(pixel_values)
-        pose_results = self.pose_processor.post_process_pose_estimation(outputs, boxes=boxes_coco)[0]
         return pose_results, boxes, scores
-    def visualize_detections(self, image: Image.Image, pose_results, boxes, scores):
-        """Visualize both bounding boxes and keypoints on the image"""
-        # Convert image to numpy array if needed
-        image_array = np.array(image)
-        # Setup detections for bounding boxes
-        detections = sv.Detections(
-            xyxy=self.coco_to_xyxy(boxes),
-            confidence=scores,
-            class_id=np.array([0]*len(scores))
-        )
-        # Create box annotator
-        box_annotator = sv.BoxAnnotator(
-            color=sv.ColorPalette.DEFAULT,
-            thickness=2
-        )
-        # Create edge annotator for keypoints
-        edge_annotator = sv.EdgeAnnotator(
-            color=sv.Color.GREEN,
-            thickness=3
-        )
-        # Convert keypoints to supervision format
         key_points = sv.KeyPoints(
             xy=torch.cat([pose_result['keypoints'].unsqueeze(0) for pose_result in pose_results]).cpu().numpy()
         )
-        # Annotate image with boxes first
-        annotated_frame = box_annotator.annotate(
-            scene=image_array.copy(),
-            detections=detections
-        )
-        # Then add keypoints
-        annotated_frame = edge_annotator.annotate(
-            scene=annotated_frame,
-            key_points=key_points
-        )
-        return Image.fromarray(annotated_frame)
     def process_image(self, input_image):
-        """Process image and return visualization"""
         if input_image is None:
             return None, ""
@@ -153,69 +204,77 @@ class KeypointDetector:
         else:
             image = input_image
-        # Detect keypoints and boxes
-        pose_results, boxes, scores = self.detect_keypoints(image)
         # Visualize results
-        result_image = self.visualize_detections(image, pose_results, boxes, scores)
-        # Create detection information text
         info_text = []
-        # Box information
-        for i, (box, score) in enumerate(zip(boxes, scores)):
-            info_text.append(f"\nPerson {i + 1} (confidence: {score:.2f})")
-            info_text.append(f"Bounding Box: x1={box[0]:.1f}, y1={box[1]:.1f}, x2={box[2]:.1f}, y2={box[3]:.1f}")
-            # Add keypoint information for this person
-            pose_result = pose_results[i]
-            for j, keypoint in enumerate(pose_result["keypoints"]):
-                x, y, confidence = keypoint
-                info_text.append(f"Keypoint {KEYPOINT_LABEL_MAP[j]}: x={x:.1f}, y={y:.1f}, confidence={confidence:.2f}")
-        return result_image, "\n".join(info_text)
 def create_gradio_interface():
     """Create Gradio interface"""
-    detector = KeypointDetector()
     with gr.Blocks() as interface:
-        gr.Markdown("# Human Detection and Keypoint Estimation using VitPose")
-        gr.Markdown("Upload an image to detect people and their keypoints. The model will:")
-        gr.Markdown("1. Detect people in the image (shown as bounding boxes)")
-        gr.Markdown("2. Identify keypoints for each detected person (shown as connected green lines)")
-        gr.Markdown("Huge shoutout to @NielsRogge and @SangbumChoi for this work!")
         with gr.Row():
             with gr.Column():
                 input_image = gr.Image(label="Input Image")
-                process_button = gr.Button("Detect People & Keypoints")
             with gr.Column():
                 output_image = gr.Image(label="Detection Results")
-                detection_info = gr.Textbox(
-                    label="Detection Information",
                     lines=10,
-                    placeholder="Detection details will appear here..."
                 )
         process_button.click(
             fn=detector.process_image,
             inputs=input_image,
-            outputs=[output_image, detection_info]
         )
         gr.Examples(
             examples=[
-                "http://images.cocodataset.org/val2017/000000000139.jpg"
             ],
             inputs=input_image
         )
     return interface
 if __name__ == "__main__":
-    interface = create_gradio_interface()
-    interface.launch()

+import cv2
 import gradio as gr
 import numpy as np
 import supervision as sv
+import torch
+from PIL import Image
 from transformers import (
     RTDetrForObjectDetection,
     RTDetrImageProcessor,
     VitPoseForPoseEstimation,
     VitPoseImageProcessor,
+    pipeline,
 )
+# Maps a keypoint index to a human-readable joint name (17 entries, left/right pairs).
+# NOTE(review): the ordering looks like the COCO 17-keypoint layout emitted by the pose model — confirm.
+# Indices 9 and 10 (the wrists) are the entries detect_wall_interaction reads as the "hands".
+KEYPOINT_LABEL_MAP = {
+    0: "Nose",
+    1: "L_Eye",
+    2: "R_Eye",
+    3: "L_Ear",
+    4: "R_Ear",
+    5: "L_Shoulder",
+    6: "R_Shoulder",
+    7: "L_Elbow",
+    8: "R_Elbow",
+    9: "L_Wrist",
+    10: "R_Wrist",
+    11: "L_Hip",
+    12: "R_Hip",
+    13: "L_Knee",
+    14: "R_Knee",
+    15: "L_Ankle",
+    16: "R_Ankle",
+}
+class InteractionDetector:
     def __init__(self):
+        """Initialise the model slots and the touch threshold, then load every model via load_models()."""
+        # Person detector and its image processor (populated by load_models).
         self.person_detector = None
         self.person_processor = None
+        # Pose estimator and its image processor (populated by load_models).
         self.pose_model = None
         self.pose_processor = None
+        # Depth-estimation and semantic-segmentation pipelines (populated by load_models).
+        self.depth_model = None
+        self.segmentation_model = None
+        # A hand counts as "touching" when the depth difference returned by
+        # get_nearest_pixel_class is strictly below this value (in depth-map units).
+        self.interaction_threshold = 2
         self.load_models()
     def load_models(self):
+        """
+        Instantiate every model the detector uses and store it on the instance.
+
+        Sets the person detector/processor, the pose model/processor, the depth
+        pipeline, the segmentation pipeline, and the two segmentation lookup
+        tables (class id -> label and label -> class id).
+        """
+        # Person detection model
         self.person_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
         self.person_detector = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
         self.pose_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple")
         self.pose_model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple")
+        # Depth estimation model
+        self.depth_model = pipeline("depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf")
+        # Semantic segmentation model
+        self.segmentation_model = pipeline("image-segmentation", model="facebook/maskformer-swin-base-ade")
+        # Lookup tables taken from the segmentation model's config: class id -> label, and its inverse.
+        self.segmentation_id2label = self.segmentation_model.model.config.id2label
+        self.segmentation_label2id = {v: k for k, v in self.segmentation_model.model.config.id2label.items()}
+    def get_nearest_pixel_class(self, joint, depth_map, segmentation_map):
+        """
+        Find the nearest pixel of a specific class to a given joint coordinate
+        Args:
+            joint: (x, y) coordinates of the joint
+            depth_map: Depth map
+            segmentation_map: Semantic segmentation results
+        Returns:
+            tuple: class_name of nearest pixel, distance to that pixel
+        """
+        PERSON_ID = 12
+        grid_x, grid_y = np.meshgrid(np.arange(depth_map.shape[0]), np.arange(depth_map.shape[1]))
+        dist_x = np.abs(grid_x.T - joint[1])
+        dist_y = np.abs(grid_y.T - joint[0])
+        dist_coord = dist_x + dist_y
+        depth_dist = np.abs(depth_map - depth_map[joint[1], joint[0]])
+        depth_dist[(segmentation_map == PERSON_ID) | (dist_coord > 50)] = 255
+        min_dist = np.unravel_index(np.argmin(depth_dist), depth_dist.shape)
+        return segmentation_map[min_dist], depth_dist[min_dist]
     def detect_persons(self, image: Image.Image):
         """Detect persons in the image"""
             threshold=0.3
         )
+        # Keep only detections labelled 0 (NOTE(review): presumably the detector's "person" class — confirm).
         boxes = results[0]["boxes"][results[0]["labels"] == 0]
         scores = results[0]["scores"][results[0]["labels"] == 0]
+        # Hand back NumPy arrays (moved to CPU first) rather than tensors.
         return boxes.cpu().numpy(), scores.cpu().numpy()
     def detect_keypoints(self, image: Image.Image):
         """Detect keypoints in the image"""
         boxes, scores = self.detect_persons(image)
+        pixel_values = self.pose_processor(image, boxes=[boxes], return_tensors="pt").pixel_values
         with torch.no_grad():
             outputs = self.pose_model(pixel_values)
+        pose_results = self.pose_processor.post_process_pose_estimation(outputs, boxes=[boxes])[0]
         return pose_results, boxes, scores
+    def estimate_depth(self, image: Image.Image):
+        """Run the depth pipeline on the image and return its 'depth' output as a NumPy array."""
+        # NOTE(review): dtype and shape follow whatever the pipeline returns under 'depth' —
+        # presumably a single-channel image the size of the input; confirm.
+        with torch.no_grad():
+            depth_map = np.array(self.depth_model(image)['depth'])
+        return depth_map
+    def segment_image(self, image: Image.Image):
+        """
+        Perform semantic segmentation on the image.
+
+        Returns a uint8 array with the image's height and width in which each
+        pixel holds the class id (looked up through segmentation_label2id) of
+        the mask that covers it.
+        """
+        with torch.no_grad():
+            segmentation_map = self.segmentation_model(image)
+        # NOTE(review): pixels not covered by any mask keep the initial value 0; if 0 is
+        # also a real class id in the label table the two cases are indistinguishable — confirm.
+        result = np.zeros(np.array(image).shape[:2], dtype=np.uint8)
+        print("Found", [l['label'] for l in segmentation_map])
+        # Paint masks from largest to smallest pixel sum, so where masks overlap
+        # the smaller one is written last and wins.
+        for cls_item in sorted(segmentation_map, key=lambda l: np.sum(l['mask']), reverse=True):
+            result[np.array(cls_item['mask']) > 0] = self.segmentation_label2id[cls_item['label']]
+        return result
+    def detect_wall_interaction(self, image: Image.Image):
+        """
+        Detect, for every person, what each hand is closest to in depth and whether it touches it.
+
+        Despite the name, the nearest class is not restricted to walls: any
+        segmentation class other than person can be reported.
+
+        Returns:
+            tuple: (interactions, pose_results, segmentation_map, depth_map) where
+            interactions is a list with one dict per person holding the person index,
+            the label nearest each hand, the touching flags and the depth distances.
+        """
+        # Get all necessary information
+        pose_results, boxes, scores = self.detect_keypoints(image)
+        depth_map = self.estimate_depth(image)
+        segmentation_map = self.segment_image(image)
+        interactions = []
+        for person_idx, pose_result in enumerate(pose_results):
+            # Get hand keypoints: indices 10 and 9 are R_Wrist and L_Wrist in KEYPOINT_LABEL_MAP.
+            # Coordinates are cast to int because they are used as array indices downstream.
+            right_hand = pose_result["keypoints"][10].numpy().astype(int)
+            left_hand = pose_result["keypoints"][9].numpy().astype(int)
+            # Find nearest anything pixels
+            right_cls, r_distance = self.get_nearest_pixel_class(right_hand[:2], depth_map, segmentation_map)
+            left_cls, l_distance = self.get_nearest_pixel_class(left_hand[:2], depth_map, segmentation_map)
+            # Check for interactions: touching means the depth distance is strictly below the threshold
+            right_touching = r_distance < self.interaction_threshold
+            left_touching = l_distance < self.interaction_threshold
+            interactions.append({
+                "person_id": person_idx,
+                "right_hand_touching_object": self.segmentation_id2label[right_cls],
+                "left_hand_touching_object": self.segmentation_id2label[left_cls],
+                "right_hand_touching": right_touching,
+                "left_hand_touching": left_touching,
+                "right_hand_distance": r_distance,
+                "left_hand_distance": l_distance
+            })
+        return interactions, pose_results, segmentation_map, depth_map
+    def visualize_results(self, image: Image.Image, interactions, pose_results):
+        """
+        Draw the pose skeletons and a filled circle on every touching hand.
+
+        Works on a copy of the image's pixel array and returns the result as a new PIL image.
+        """
+        # Create base visualization from original image
+        vis_image = np.array(image).copy()
+        # Add pose keypoints
+        edge_annotator = sv.EdgeAnnotator(color=sv.Color.GREEN, thickness=2)
+        # Stack every person's keypoints into one array with a leading per-person axis.
+        # NOTE(review): with no detected persons the list handed to torch.cat is empty — verify that path.
         key_points = sv.KeyPoints(
             xy=torch.cat([pose_result['keypoints'].unsqueeze(0) for pose_result in pose_results]).cpu().numpy()
         )
+        vis_image = edge_annotator.annotate(scene=vis_image, key_points=key_points)
+        # Add interaction indicators
+        for interaction in interactions:
+            person_id = interaction["person_id"]
+            pose_result = pose_results[person_id]
+            # Draw indicators for touching hands (radius 10, filled).
+            # NOTE(review): (0, 0, 255) is red only in BGR order; the array comes from a PIL image
+            # (presumably RGB), so this likely renders blue — confirm the intended colour.
+            if interaction["right_hand_touching"]:
+                cv2.circle(vis_image,
+                           tuple(map(int, pose_result["keypoints"][10][:2])),
+                           10, (0, 0, 255), -1)
+            if interaction["left_hand_touching"]:
+                cv2.circle(vis_image,
+                           tuple(map(int, pose_result["keypoints"][9][:2])),
+                           10, (0, 0, 255), -1)
+        return Image.fromarray(vis_image)
     def process_image(self, input_image):
+        """
+        Process image and return visualization with interaction detection.
+
+        On the normal path returns a 4-tuple: (annotated image, colourised
+        segmentation mask, depth map, newline-joined interaction text).
+        """
+        # NOTE(review): this early return yields 2 values, while the normal path returns 4 and the
+        # click handler in create_gradio_interface wires 4 outputs — confirm and align the arity.
         if input_image is None:
             return None, ""
         else:
             image = input_image
+        # Every input is resized to a fixed 1280x720, regardless of its original aspect ratio.
+        image = image.resize((1280, 720))
+        # Detect interactions
+        interactions, pose_results, segmentation_map, depth_map = self.detect_wall_interaction(image)
         # Visualize results
+        result_image = self.visualize_results(image, interactions, pose_results)
+        # Create interaction information text
         info_text = []
+        for interaction in interactions:
+            info_text.append(f"\nPerson {interaction['person_id'] + 1}:")
+            if interaction["right_hand_touching"]:
+                info_text.append(f"Right hand is touching {interaction['right_hand_touching_object']}")
+            if interaction["left_hand_touching"]:
+                info_text.append(f"Left hand is touching {interaction['left_hand_touching_object']}")
+            # The reported value is the depth difference to the nearest non-person pixel of any
+            # class, even though the label below says "wall".
+            info_text.append(f"Right hand distance to wall: {interaction['right_hand_distance']:.2f}")
+            info_text.append(f"Left hand distance to wall: {interaction['left_hand_distance']:.2f}")
+        # Add color to segmentation
+        mask = np.zeros((*segmentation_map.shape, 3), dtype=np.uint8)
+        # The palette is drawn unseeded on every call, so class colours differ between runs;
+        # class ids are mapped modulo the palette size (100), so ids 100 apart share a colour.
+        colors = np.random.randint(0, 255, size=(100, 3))
+        for cl_id in np.unique(segmentation_map):
+            mask_array = np.array(segmentation_map == cl_id)
+            color = colors[cl_id % len(colors)]
+            mask[mask_array] = color
+        return result_image, mask, depth_map, "\n".join(info_text)
 def create_gradio_interface():
     """Create Gradio interface"""
+    detector = InteractionDetector()
     with gr.Blocks() as interface:
+        gr.Markdown("# Object Interaction Detection")
+        gr.Markdown("Upload an image to detect when people are touching objects.")
         with gr.Row():
             with gr.Column():
                 input_image = gr.Image(label="Input Image")
+                process_button = gr.Button("Detect Interactions")
             with gr.Column():
                 output_image = gr.Image(label="Detection Results")
+                interaction_info = gr.Textbox(
+                    label="Interaction Information",
                     lines=10,
+                    placeholder="Interaction details will appear here..."
                 )
+                segmentation_im = gr.Image(label="Segmentaiton Results")
+                depth_im = gr.Image(label="Depth Results")
         process_button.click(
             fn=detector.process_image,
             inputs=input_image,
+            outputs=[output_image, segmentation_im, depth_im, interaction_info]
         )
         gr.Examples(
             examples=[
+                "https://img.freepik.com/premium-photo/happy-black-man-opening-door-gesturing-okay-approving-new-home_116547-23954.jpg?w=1800",
+                "https://static3.bigstockphoto.com/6/7/2/large1500/276757975.jpg"
             ],
             inputs=input_image
         )
     return interface
+# Built at import time, so importing this module constructs the detector and loads every model.
+# NOTE(review): presumably kept at module level so tooling that imports the app can find
+# `interface` — confirm this is required before moving it under the main guard.
+interface = create_gradio_interface()
 if __name__ == "__main__":
+    # debug=True is passed through to Gradio's launch().
+    interface.launch(debug=True)