ahujasherry18 commited on
Commit
e14cc5c
·
verified ·
1 Parent(s): 211ec0e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -0
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from PIL import Image, ImageDraw,ImageFont
import scipy.io.wavfile as wavfile

# Use a pipeline as a high-level helper
from transformers import pipeline

# Local snapshot paths for offline use (kept for reference; the hub IDs below
# are what is actually loaded — see the commented-out pipeline calls).
model_path = ("../Model/models--facebook--detr-resnet-50/snapshots"
"/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")

tts_model_path = ("../Model/models--kakao-enterprise--vits-ljs/snapshots"
"/3bcb8321394f671bd948ebf0d086d694dda95464")

# Text-to-speech pipeline (VITS trained on LJSpeech); downloads from the hub
# on first run — this is a module-level side effect.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# DETR object-detection pipeline (ResNet-50 backbone), also loaded from the hub.
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Alternative: load from the local snapshot paths above instead of the hub.
# object_detector = pipeline("object-detection", model=model_path)
# narrator = pipeline("text-to-speech", model=tts_model_path)
25
def generate_audio(text):
    """Synthesize *text* to speech and write it to a WAV file.

    Parameters:
        text (str): The sentence to narrate.

    Returns:
        str: Path of the WAV file written ("finetuned_output.wav").
    """
    output_path = "finetuned_output.wav"  # single source of truth for the path
    # narrator returns a dict with "audio" (shape (1, n) float array) and
    # "sampling_rate" — TODO confirm against the TTS pipeline output format.
    narrated_text = narrator(text)
    wavfile.write(output_path,
                  rate=narrated_text["sampling_rate"],
                  data=narrated_text["audio"][0])
    return output_path
31
+
32
+
33
+
34
def read_objects(detection_objects):
    """Build an English sentence summarizing the detected objects.

    Parameters:
        detection_objects (list): Detection dicts; each must carry a
            'label' key (other keys are ignored).

    Returns:
        str: e.g. "This picture contains 2 cats, 1 dog and 1 person."
             For an empty detection list: "This picture contains no objects."
    """
    # Local import so the file-level import block stays untouched.
    from collections import Counter

    # Count occurrences of each label; Counter preserves first-seen order.
    object_counts = Counter(detection['label'] for detection in detection_objects)

    # Edge case the original left awkward ("This picture contains."): say so
    # explicitly when nothing was detected.
    if not object_counts:
        return "This picture contains no objects."

    response = "This picture contains"
    labels = list(object_counts)
    for i, label in enumerate(labels):
        count = object_counts[label]
        # Naive pluralization: just append "s" when count > 1.
        suffix = "s" if count > 1 else ""
        response += f" {count} {label}{suffix}"
        # List punctuation: commas between items, "and" before the last one.
        if i < len(labels) - 2:
            response += ","
        elif i == len(labels) - 2:
            response += " and"
    response += "."
    return response
60
+
61
+
62
+
63
def draw_bounding_boxes(image, detection_results):
    """Annotate *image* with one red box and a label/score caption per detection.

    Parameters:
        image (PIL.Image): The input image to be annotated (drawn on in place).
        detection_results (list): Detection dicts, each with 'box' (dict of
            xmin/ymin/xmax/ymax), 'label', and 'score'.

    Returns:
        PIL.Image: The same image object, now carrying the annotations.
    """
    canvas = ImageDraw.Draw(image)

    for detection in detection_results:
        box = detection['box']
        xmin, ymin, xmax, ymax = (box['xmin'], box['ymin'],
                                  box['xmax'], box['ymax'])

        # Red rectangle around the detected object.
        canvas.rectangle([xmin, ymin, xmax, ymax], outline="red", width=3)

        # Caption like "dog (97.3%)" placed just above the top-left corner.
        caption = f"{detection['label']} ({detection['score'] * 100:.1f}%)"
        canvas.text((xmin, ymin - 10), caption, fill="red")

    return image
95
+
96
def detect_objects(image):
    """Detect objects in *image*; return the annotated image and narration audio.

    Parameters:
        image (PIL.Image): Input image from the gradio widget.

    Returns:
        tuple: (annotated PIL.Image, path of the generated WAV file).
    """
    detections = object_detector(image)
    annotated_image = draw_bounding_boxes(image, detections)
    summary_sentence = read_objects(detections)
    audio_path = generate_audio(summary_sentence)
    return annotated_image, audio_path
103
+
104
+
105
+
106
# Gradio UI: image in, annotated image + narration audio out.
demo = gr.Interface(
    fn=detect_objects,
    inputs=[gr.Image(label="Select Image", type="pil")],
    # Fixed label: the first output is the annotated image, not summarized text.
    outputs=[gr.Image(label="Annotated Image", type="pil"),
             gr.Audio(label="Generated Audio")],
    title="@SherryAhuja Project : Object Detection with Audio",
    description="This AI application will be used to Detect objects in an image and generate audio.",
)
demo.launch()