ahujasherry18 committed
Commit bc993ef · verified · 1 Parent(s): 51b3750

Update app.py

Files changed (1)
  1. app.py +84 -49
app.py CHANGED
@@ -1,34 +1,46 @@
 import gradio as gr
-from PIL import Image, ImageDraw,ImageFont
+from PIL import Image, ImageDraw, ImageFont
 import scipy.io.wavfile as wavfile


 # Use a pipeline as a high-level helper
 from transformers import pipeline

-model_path = ("../Model/models--facebook--detr-resnet-50/snapshots"
-              "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
+# model_path = ("../Models/models--facebook--detr-resnet-50/snapshots"
+#               "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
+#
+# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
+#                   "/3bcb8321394f671bd948ebf0d086d694dda95464")

-tts_model_path = ("../Model/models--kakao-enterprise--vits-ljs/snapshots"
-                  "/3bcb8321394f671bd948ebf0d086d694dda95464")

+narrator = pipeline("text-to-speech",
+                    model="kakao-enterprise/vits-ljs")

-narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
+object_detector = pipeline("object-detection",
+                           model="facebook/detr-resnet-50")

-object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
-
-
-# object_detector = pipeline("object-detection", model=model_path)
-# narrator = pipeline("text-to-speech", model=tts_model_path)
+# object_detector = pipeline("object-detection",
+#                            model=model_path)
+#
+# narrator = pipeline("text-to-speech",
+#                     model=tts_model_path)

+# [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]

+# Define the function to generate audio from text
 def generate_audio(text):
+    # Generate the narrated text
     narrated_text = narrator(text)
-    wavfile.write("finetuned_output.wav",
-                  rate=narrated_text["sampling_rate"],
+
+    # Save the audio to a WAV file
+    wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
                   data=narrated_text["audio"][0])
-    return "finetuned_output.wav";

+    # Return the path to the saved audio file
+    return "output.wav"
+
+# Could you please write me a python code that will take list of detection object as an input and it will give the response that will include all the objects (labels) provided in the input. For example if the input is like this: [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]
+# The output should be, This pictuture contains 1 person and 1 dog. If there are multiple objects, do not add 'and' between every objects but 'and' should be at the end only


 def read_objects(detection_objects):
@@ -56,56 +68,79 @@ def read_objects(detection_objects):
         response += " and"

     response += "."
+
     return response



-def draw_bounding_boxes(image, detection_results):
+def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
     """
-    Draws bounding boxes on the provided image based on the detection results.
-
-    Parameters:
-    image (PIL.Image): The input image to be annotated.
-    detection_results (list): A list of dictionaries, each containing the detected object details.
-
-    Returns:
-    PIL.Image: The image with bounding boxes drawn around the detected objects.
+    Draws bounding boxes on the given image based on the detections.
+
+    :param image: PIL.Image object
+    :param detections: List of detection results, where each result is a dictionary containing
+                       'score', 'label', and 'box' keys. 'box' itself is a dictionary with 'xmin',
+                       'ymin', 'xmax', 'ymax'.
+    :param font_path: Path to the TrueType font file to use for text.
+    :param font_size: Size of the font to use for text.
+    :return: PIL.Image object with bounding boxes drawn.
     """
-    # Convert the input image to ImageDraw object to draw on it
-    draw = ImageDraw.Draw(image)
-
-    # Iterate through each detection result
-    for result in detection_results:
-        # Extract the bounding box coordinates and label
-        box = result['box']
-        label = result['label']
-        score = result['score']
+    # Make a copy of the image to draw on
+    draw_image = image.copy()
+    draw = ImageDraw.Draw(draw_image)
+
+    # Load custom font or default font if path not provided
+    if font_path:
+        font = ImageFont.truetype(font_path, font_size)
+    else:
+        # When font_path is not provided, load default font but it's size is fixed
+        font = ImageFont.load_default()
+        # Increase font size workaround by using a TTF font file, if needed, can download and specify the path
+
+    for detection in detections:
+        box = detection['box']
+        xmin = box['xmin']
+        ymin = box['ymin']
+        xmax = box['xmax']
+        ymax = box['ymax']
+
+        # Draw the bounding box
+        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
+
+        # Optionally, you can also draw the label and score
+        label = detection['label']
+        score = detection['score']
+        text = f"{label} {score:.2f}"

-        # Define coordinates for the bounding box
-        xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
+        # Draw text with background rectangle for visibility
+        if font_path:  # Use the custom font with increased size
+            text_size = draw.textbbox((xmin, ymin), text, font=font)
+        else:
+            # Calculate text size using the default font
+            text_size = draw.textbbox((xmin, ymin), text)

-        # Draw the bounding box (with a red outline)
-        draw.rectangle([xmin, ymin, xmax, ymax], outline="red", width=3)
+        draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
+        draw.text((xmin, ymin), text, fill="white", font=font)

-        # Optionally, add label with score near the bounding box
-        text = f"{label} ({score * 100:.1f}%)"
-        draw.text((xmin, ymin - 10), text, fill="red")
+    return draw_image

-    return image

-def detect_objects(image):
+def detect_object(image):
     raw_image = image
     output = object_detector(raw_image)
     processed_image = draw_bounding_boxes(raw_image, output)
-    naturalized_text = read_objects(output)
-    processed_audio = generate_audio(naturalized_text)
+    natural_text = read_objects(output)
+    processed_audio = generate_audio(natural_text)
     return processed_image, processed_audio


-
-demo = gr.Interface(fn = detect_objects,
+demo = gr.Interface(fn=detect_object,
                     inputs=[gr.Image(label="Select Image",type="pil")],
-                    outputs=[gr.Image(label="Summarized Text ",type="pil"), gr.Audio(label="Generated Audio")],
-                    title="@SherryAhuja Project : Object Detection with Audio",
-                    description="This AI application will be used to Detect objects in an image and generate audio.",)
-demo.launch()
+                    outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
+                    title="@GenAILearniverse Project 7: Object Detector with Audio",
+                    description="THIS APPLICATION WILL BE USED TO HIGHLIGHT OBJECTS AND GIVES AUDIO DESCRIPTION FOR THE PROVIDED INPUT IMAGE.")
+demo.launch()
+
+# print(output)
+
+
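
A note on generate_audio above: the text-to-speech pipeline returns a dictionary rather than a bare waveform, and VITS emits a batch of waveforms, which is why the code indexes audio[0] before writing the WAV file. Roughly, with the shapes and sampling rate one would expect for kakao-enterprise/vits-ljs (treat these specifics as assumptions, not guarantees):

    speech = narrator("This picture contains 1 person and 1 dog.")
    # speech is typically a dict along the lines of:
    #   {"audio": <numpy array of shape (1, num_samples)>, "sampling_rate": 22050}
    # so audio[0] strips the batch dimension before the waveform is written out.
    wavfile.write("output.wav", rate=speech["sampling_rate"],
                  data=speech["audio"][0])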
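
The body of read_objects sits between the two hunks, so the diff shows only its closing lines (response += " and", response += ".", return response). Going by the leftover specification comment in the new file ("This picture contains 1 person and 1 dog", with "and" only before the last item), a minimal sketch of such a helper could look like this; it is an illustration consistent with the visible context, not the file's actual implementation:

    from collections import Counter

    def read_objects(detection_objects):
        # Tally how many times each label appears in the detections
        counts = Counter(obj["label"] for obj in detection_objects)
        parts = [f"{n} {label}{'s' if n > 1 else ''}" for label, n in counts.items()]

        response = "This picture contains"
        for i, part in enumerate(parts):
            if i > 0 and i == len(parts) - 1:
                response += " and"   # "and" only before the final item
            elif i > 0:
                response += ","
            response += f" {part}"
        response += "."
        return response

    print(read_objects([{"label": "person"}, {"label": "dog"}]))
    # -> This picture contains 1 person and 1 dog.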
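
The commented-out sample detector output near the top of the new file also works as a fixture for eyeballing draw_bounding_boxes without downloading either model. A quick scratch-script sketch (the blank canvas, its size, and the output file name are made up for the test; paste draw_bounding_boxes into the script rather than importing app.py, since importing the module would instantiate both pipelines and launch the Gradio demo):

    from PIL import Image, ImageDraw, ImageFont

    # (paste draw_bounding_boxes from app.py here)

    # Sample detections copied from the comment in app.py
    detections = [
        {'score': 0.9996405839920044, 'label': 'person',
         'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}},
        {'score': 0.9995879530906677, 'label': 'dog',
         'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}},
    ]

    # Hypothetical 1000x1000 blank canvas, large enough for the sample boxes
    canvas = Image.new("RGB", (1000, 1000), "white")
    annotated = draw_bounding_boxes(canvas, detections)
    annotated.save("annotated_sample.png")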