Spaces:

ahujasherry18
/

object-dection-with-audio

Running

App Files Community

ahujasherry18 committed 24 days ago

Commit

e14cc5c

verified ·

1 Parent(s): 211ec0e

Create app.py

Browse files

Files changed (1) hide show

app.py +111 -0

app.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import gradio as gr
+from PIL import Image, ImageDraw,ImageFont
+import scipy.io.wavfile as wavfile
+# Use a pipeline as a high-level helper
+from transformers import pipeline
# Paths to locally cached model snapshots. In this file they are referenced
# only by the commented-out pipeline calls at the end of this section.
model_path = (
    "../Model/models--facebook--detr-resnet-50/snapshots"
    "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b"
)
tts_model_path = (
    "../Model/models--kakao-enterprise--vits-ljs/snapshots"
    "/3bcb8321394f671bd948ebf0d086d694dda95464"
)

# Both pipelines are created once, at import time, from their model ids and
# are shared by the functions defined further down.
narrator = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")
object_detector = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
)
# object_detector = pipeline("object-detection", model=model_path)
# narrator = pipeline("text-to-speech", model=tts_model_path)
def generate_audio(text):
    """
    Synthesize speech for the given text and store it as a WAV file.

    The module-level ``narrator`` pipeline is called on ``text``. The first
    entry of its ``"audio"`` output is written, at the reported
    ``"sampling_rate"``, to ``finetuned_output.wav`` (a relative path, so the
    file lands in the current working directory and is reused on every call).

    Parameters:
        text (str): The sentence to narrate.
    Returns:
        str: The path of the written file, ``"finetuned_output.wav"``.
    """
    output_path = "finetuned_output.wav"
    speech = narrator(text)
    sample_rate = speech["sampling_rate"]
    waveform = speech["audio"][0]
    wavfile.write(output_path, rate=sample_rate, data=waveform)
    return output_path
# Plural forms that the suffix rules in ``_pluralize`` cannot derive. Labels
# that are already plural, or that do not change in the plural, map to
# themselves.
_IRREGULAR_PLURALS = {
    "person": "people",
    "knife": "knives",
    "mouse": "mice",
    "sheep": "sheep",
    "skis": "skis",
    "scissors": "scissors",
}


def _pluralize(label):
    """Return the English plural of a (possibly multi-word) object label."""
    if label in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[label]
    # "bus" -> "buses", "couch" -> "couches", "wine glass" -> "wine glasses"
    if label.endswith(("s", "x", "z", "ch", "sh")):
        return label + "es"
    # consonant + "y" -> "ies"; a vowel before the "y" keeps the plain "s"
    if len(label) > 1 and label.endswith("y") and label[-2] not in "aeiou":
        return label[:-1] + "ies"
    return label + "s"


def read_objects(detection_objects):
    """
    Build a one-sentence English summary of the detected objects.

    Detections are grouped by their ``'label'`` and listed in order of first
    appearance, e.g. ``"This picture contains 2 dogs, 1 cat and 1 car."``.
    Labels counted more than once are pluralised.

    Parameters:
        detection_objects (list): Dictionaries that each carry a ``'label'``
            key. ``None`` or an empty list is treated as "nothing detected".
    Returns:
        str: The summary sentence. When nothing was detected this is
        ``"This picture contains no recognizable objects."``.
    """
    # Count occurrences per label; a dict keeps first-seen order.
    object_counts = {}
    for detection in detection_objects or ():
        label = detection['label']
        object_counts[label] = object_counts.get(label, 0) + 1

    # Without this guard the sentence would end right after "contains".
    if not object_counts:
        return "This picture contains no recognizable objects."

    phrases = [
        f"{count} {_pluralize(label) if count > 1 else label}"
        for label, count in object_counts.items()
    ]
    if len(phrases) == 1:
        listing = phrases[0]
    else:
        # "a, b and c": commas between all but the last two items.
        listing = ", ".join(phrases[:-1]) + " and " + phrases[-1]
    return f"This picture contains {listing}."
def draw_bounding_boxes(image, detection_results):
    """
    Draws a red bounding box and a "label (score%)" caption for each detection.

    Drawing happens in place on ``image``; the same object is returned.

    Parameters:
        image (PIL.Image): The image to annotate (modified in place).
        detection_results (list): A list of dictionaries, each with a
            ``'label'``, a ``'score'`` and a ``'box'`` dictionary holding
            ``'xmin'``, ``'ymin'``, ``'xmax'`` and ``'ymax'``.
    Returns:
        PIL.Image: ``image`` itself, with boxes and captions drawn on it.
    """
    outline_width = 3
    caption_offset = 10  # captions normally start this many pixels above the box

    # ImageDraw.Draw wraps the image; everything drawn lands on `image`.
    canvas = ImageDraw.Draw(image)
    for result in detection_results:
        box = result['box']
        xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
        canvas.rectangle([xmin, ymin, xmax, ymax], outline="red", width=outline_width)

        # The score is multiplied by 100 and shown as a percentage.
        caption = f"{result['label']} ({result['score'] * 100:.1f}%)"

        # A box at the very top of the image leaves no room above it, and a
        # caption at a negative y would be clipped or invisible. In that case
        # the caption goes just inside the box, below the outline.
        caption_y = ymin - caption_offset
        if caption_y < 0:
            caption_y = ymin + outline_width + 1
        caption_x = max(xmin, 0)
        canvas.text((caption_x, caption_y), caption, fill="red")
    return image
def detect_objects(image):
    """
    Detects objects in an image and returns the annotated image plus narration.

    The image is passed to the module-level ``object_detector``. The
    detections are drawn onto the image with ``draw_bounding_boxes``, turned
    into a sentence with ``read_objects``, and narrated with
    ``generate_audio``.

    Parameters:
        image (PIL.Image): The picture to analyse.
    Returns:
        tuple: ``(annotated_image, audio_path)`` - the image with bounding
        boxes drawn on it and the value returned by ``generate_audio``.
    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    if image is None:
        # NOTE(review): expected when the form is submitted without a picture.
        # Raise a readable UI error rather than failing inside the detector.
        raise gr.Error("Please select an image first.")

    detections = object_detector(image)
    annotated_image = draw_bounding_boxes(image, detections)
    description = read_objects(detections)
    audio_path = generate_audio(description)
    return annotated_image, audio_path
# Gradio UI: one PIL image in; the annotated image and the narration audio out.
# The first output shows the picture returned by detect_objects, so its label
# names the detections rather than a text summary.
demo = gr.Interface(
    fn=detect_objects,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Image(label="Detected Objects", type="pil"),
        gr.Audio(label="Generated Audio"),
    ],
    title="@SherryAhuja Project : Object Detection with Audio",
    description="This AI application will be used to Detect objects in an image and generate audio.",
)
demo.launch()