Update app.py
app.py CHANGED
@@ -1,34 +1,46 @@
 import gradio as gr
-from PIL import Image, ImageDraw,ImageFont
+from PIL import Image, ImageDraw, ImageFont
 import scipy.io.wavfile as wavfile
 
 
 # Use a pipeline as a high-level helper
 from transformers import pipeline
 
-model_path = ("../Models/models--facebook--detr-resnet-50/snapshots"
-              "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
-
-tts_model_path = ("../Model/models--kakao-enterprise--vits-ljs/snapshots"
-                  "/3bcb8321394f671bd948ebf0d086d694dda95464")
+# model_path = ("../Models/models--facebook--detr-resnet-50/snapshots"
+#               "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
+#
+# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
+#                   "/3bcb8321394f671bd948ebf0d086d694dda95464")
 
 
-narrator = pipeline("text-to-speech", model=tts_model_path)
+narrator = pipeline("text-to-speech",
+                    model="kakao-enterprise/vits-ljs")
 
-object_detector = pipeline("object-detection",
-                           model=model_path)
-
-#
-#
+object_detector = pipeline("object-detection",
+                           model="facebook/detr-resnet-50")
+
+# object_detector = pipeline("object-detection",
+#                            model=model_path)
+#
+# narrator = pipeline("text-to-speech",
+#                     model=tts_model_path)
 
+# Sample detector output:
+# [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]
 
+# Define the function to generate audio from text
 def generate_audio(text):
+    # Generate the narrated text
     narrated_text = narrator(text)
-
-    wavfile.write("finetuned_output.wav", rate=narrated_text["sampling_rate"],
+
+    # Save the audio to a WAV file
+    wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
                   data=narrated_text["audio"][0])
-    return "finetuned_output.wav";
 
+    # Return the path to the saved audio file
+    return "output.wav"
+
+# read_objects(detection_objects) turns the detector output into one sentence that
+# names every label, e.g. "This picture contains 1 person and 1 dog." for the
+# sample output above, placing "and" only before the last item rather than
+# between every pair.
 
 
 def read_objects(detection_objects):
@@ -56,56 +68,79 @@ def read_objects(detection_objects):
             response += " and"
 
     response += "."
+
     return response
 
 
 
-def draw_bounding_boxes(image, detections):
+def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
     """
-    Draws bounding boxes on the given image based on the detections.
-
-    :param image: PIL.Image object
-    :param detections: List of detection results, where each result is a dictionary containing
-                       'score', 'label', and 'box' keys. 'box' itself is a dictionary with 'xmin',
-                       'ymin', 'xmax', 'ymax'.
-    :return: PIL.Image object with bounding boxes drawn.
+    Draws bounding boxes on the given image based on the detections.
+
+    :param image: PIL.Image object
+    :param detections: List of detection results, where each result is a dictionary containing
+                       'score', 'label', and 'box' keys. 'box' itself is a dictionary with 'xmin',
+                       'ymin', 'xmax', 'ymax'.
+    :param font_path: Path to the TrueType font file to use for text.
+    :param font_size: Size of the font to use for text.
+    :return: PIL.Image object with bounding boxes drawn.
     """
-    draw = ImageDraw.Draw(image)
-
-    for detection in detections:
-        box = detection['box']
-        xmin = box['xmin']
-        ymin = box['ymin']
-        xmax = box['xmax']
-        ymax = box['ymax']
-
-        # Draw the bounding box
-        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
-
-        label = detection['label']
-        score = detection['score']
-        text = f"{label} ({score * 100:.1f}%)"
-        draw.text((xmin, ymin - 10), text, fill="red")
-
-    return image
+    # Make a copy of the image to draw on
+    draw_image = image.copy()
+    draw = ImageDraw.Draw(draw_image)
+
+    # Load a custom font, or the default font if no path is provided
+    if font_path:
+        font = ImageFont.truetype(font_path, font_size)
+    else:
+        # When font_path is not provided, load the default font; its size is fixed
+        font = ImageFont.load_default()
+        # Workaround for larger text: download a TTF font file and pass its path
+
+    for detection in detections:
+        box = detection['box']
+        xmin = box['xmin']
+        ymin = box['ymin']
+        xmax = box['xmax']
+        ymax = box['ymax']
+
+        # Draw the bounding box
+        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
+
+        # Optionally, you can also draw the label and score
+        label = detection['label']
+        score = detection['score']
+        text = f"{label} {score:.2f}"
+
+        # Draw text with a background rectangle for visibility
+        if font_path:  # Use the custom font with increased size
+            text_size = draw.textbbox((xmin, ymin), text, font=font)
+        else:
+            # Calculate text size using the default font
+            text_size = draw.textbbox((xmin, ymin), text)
+
+        draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
+        draw.text((xmin, ymin), text, fill="white", font=font)
+
+    return draw_image
 
 
-def detect_objects(image):
+def detect_object(image):
     raw_image = image
     output = object_detector(raw_image)
     processed_image = draw_bounding_boxes(raw_image, output)
-    processed_audio = generate_audio(
+    natural_text = read_objects(output)
+    processed_audio = generate_audio(natural_text)
     return processed_image, processed_audio
 
 
-demo = gr.Interface(fn = detect_objects,
+demo = gr.Interface(fn=detect_object,
                     inputs=[gr.Image(label="Select Image",type="pil")],
-                    outputs=[gr.Image(label="
-                    title="@
-                    description="
-demo.launch()
+                    outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
+                    title="@GenAILearniverse Project 7: Object Detector with Audio",
+                    description="THIS APPLICATION WILL BE USED TO HIGHLIGHT OBJECTS AND GIVE AUDIO DESCRIPTIONS FOR THE PROVIDED INPUT IMAGE.")
+demo.launch()
+
+# print(output)
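
The body of read_objects falls outside this diff's context; only its closing lines (response += " and", response += ".", return response) are visible above. As a rough sketch of the behavior the in-code comment describes, and consistent with that visible tail, the hidden part could look something like the following. This is a guessed reconstruction for illustration, not the file's actual code:

# Hypothetical reconstruction of read_objects; the real body is hidden by the diff context.
def read_objects(detection_objects):
    # Tally how many times each label appears in the detector output
    object_counts = {}
    for detection in detection_objects:
        label = detection["label"]
        object_counts[label] = object_counts.get(label, 0) + 1

    # Build the sentence, separating items with commas and "and" before the last one
    response = "This picture contains"
    labels = list(object_counts.keys())
    for i, label in enumerate(labels):
        response += f" {object_counts[label]} {label}"
        if object_counts[label] > 1:
            response += "s"  # naive pluralization
        if i < len(labels) - 2:
            response += ","
        elif i == len(labels) - 2:
            response += " and"

    response += "."
    return response

On the sample detector output near the top of the file, this yields "This picture contains 1 person and 1 dog.", which detect_object then hands to generate_audio.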
|
145 |
+
|
146 |
+
|
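
For a quick smoke test of the two pipelines outside Gradio, something along these lines should work; "sample.jpg" is a placeholder path, and read_objects refers to the file's own implementation (or the sketch above):

# Hypothetical smoke test; "sample.jpg" is a placeholder image path
from PIL import Image

img = Image.open("sample.jpg")
detections = object_detector(img)   # list of {'score', 'label', 'box'} dicts
print(detections)
print(read_objects(detections))     # e.g. "This picture contains 1 person and 1 dog."
print(generate_audio(read_objects(detections)))  # writes and returns "output.wav"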