zeeshanali01 committed • commit c8c17e0
Parent(s): cd696bb
Upload multimodal_rag.py
Files changed: multimodal_rag.py (+241, -0)

multimodal_rag.py ADDED
@@ -0,0 +1,241 @@
# -*- coding: utf-8 -*-
"""Multimodal RAG.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1RAN3Kvcx2pejp0bdlZ3PFxJC5FgW6Yzz
"""

!pip install -q -U transformers==4.37.2
!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q gTTS

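# Note (editor): the `!pip ...` and `!ffmpeg ...` lines are IPython shell magics kept from
# the original Colab notebook; they are not valid statements in a plain Python module, so
# outside a notebook the dependencies are expected to come from requirements.txt (or an
# equivalent install step). openai-whisper also needs the ffmpeg binary on PATH, which the
# pip packages above do not provide.
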
import torch
from transformers import BitsAndBytesConfig, pipeline

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_id = "llava-hf/llava-1.5-7b-hf"

pipe = pipeline("image-to-text",
                model=model_id,
                model_kwargs={"quantization_config": quantization_config})

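# The 4-bit quantization above (load_in_4bit with float16 compute) keeps the 7B LLaVA
# checkpoint small enough for a single consumer GPU such as a Colab T4. A rough, optional
# sanity check of the footprint after loading -- illustrative only, and assumes a CUDA
# device is present:
if torch.cuda.is_available():
    print(f"GPU memory allocated after loading LLaVA: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
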
import whisper
import gradio as gr
import time
import warnings
import os
from gtts import gTTS

from PIL import Image

image_path = "img.jpg"
image = Image.open(image_path)
image

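# Note: the bare `image` expression above only displays the picture when run in a notebook
# cell; as a plain script it is a no-op. `img.jpg` is assumed to be uploaded alongside
# this file.
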
import nltk
nltk.download('punkt')
from nltk import sent_tokenize

import locale
print(locale.getlocale())  # Before running the pipeline
# Run the pipeline
print(locale.getlocale())  # After running the pipeline

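# Note: with newer NLTK releases sent_tokenize may also require the 'punkt_tab' resource;
# if a LookupError appears, nltk.download('punkt_tab') can be added next to the 'punkt'
# download above.
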
max_new_tokens = 200

prompt_instructions = """
Describe the image using as much detail as possible,
is it a painting, a photograph, what colors are predominant,
what is the image about?
"""

prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
# outputs
# print(outputs[0]["generated_text"])
for sent in sent_tokenize(outputs[0]["generated_text"]):
    print(sent)

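# The image-to-text pipeline returns a list of dicts of the form
# [{"generated_text": "USER: <prompt> ASSISTANT: <answer>"}], i.e. the prompt is echoed
# back in front of the answer; that is why the later cells split on "ASSISTANT:" with a
# regex instead of printing generated_text directly.
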
warnings.filterwarnings("ignore")

import warnings
from gtts import gTTS
import numpy as np

torch.cuda.is_available()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

import whisper
model = whisper.load_model("medium", device=DEVICE)
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

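# Optional quick check that Whisper loads and transcribes correctly, independent of the
# Gradio app (illustrative; assumes some local clip such as 'sample.wav' exists):
# result = model.transcribe("sample.wav")
# print(result["text"])
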
import re

input_text = 'What color is the microphone in image?'
input_image = 'img.jpg'

# load the image
image = Image.open(input_image)

# prompt_instructions = """
# Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
# """

# print(input_text)
prompt_instructions = """
Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
""" + input_text
prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

# print(prompt)

outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])

if match:
    # Extract the text after "ASSISTANT:"
    extracted_text = match.group(1)
    print(extracted_text)
else:
    print("No match found.")

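# Note: without the re.DOTALL flag, r'ASSISTANT:\s*(.*)' captures only up to the first
# newline of the answer; for multi-paragraph replies something like
# re.search(r'ASSISTANT:\s*(.*)', text, re.DOTALL) keeps the full response.
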
for sent in sent_tokenize(outputs[0]["generated_text"]):
    print(sent)

import datetime
import os

## Logger file
tstamp = datetime.datetime.now()
tstamp = str(tstamp).replace(' ', '_')
logfile = f'{tstamp}_log.txt'

def writehistory(text):
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')

import re
import requests
from PIL import Image

def img2txt(input_text, input_image):

    # load the image
    image = Image.open(input_image)

    writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")
    if isinstance(input_text, tuple):
        prompt_instructions = """
        Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
        """
    else:
        prompt_instructions = """
        Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
        """ + input_text

    writehistory(f"prompt_instructions: {prompt_instructions}")
    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"

    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

    # Properly extract the response text
    if outputs is not None and len(outputs[0]["generated_text"]) > 0:
        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
        if match:
            # Extract the text after "ASSISTANT:"
            reply = match.group(1)
        else:
            reply = "No response found."
    else:
        reply = "No response generated."

    return reply

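# Illustrative direct call, outside the Gradio UI (assumes img.jpg is present):
# print(img2txt("What is shown in this image?", "img.jpg"))
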
def transcribe(audio):

    # Check if the audio input is None or empty
    if audio is None or audio == '':
        return ('', '', None)  # Return empty strings and None audio file

    # language = 'en'

    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    _, probs = model.detect_language(mel)

    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    return result_text

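# Note: whisper.pad_or_trim() pads or clips the audio to 30 seconds, so longer recordings
# are truncated; `probs` from detect_language() is computed but not used here, and the
# early-return tuple ('', '', None) is what img2txt's isinstance(..., tuple) branch is
# checking for.
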
def text_to_speech(text, file_path):
    language = 'en'

    audioobj = gTTS(text=text,
                    lang=language,
                    slow=False)

    audioobj.save(file_path)

    return file_path

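# Illustrative usage -- writes an MP3 via Google Text-to-Speech (needs network access):
# text_to_speech("Hello from the multimodal RAG demo", "hello.mp3")
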
!ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3

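# The ffmpeg command above generates Temp.mp3, a 10-second silent MP3 (anullsrc is a null
# audio source), used purely as a placeholder/initial value for the audio output component
# created further down.
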
import gradio as gr
import base64
import os

# A function to handle audio and image inputs
def process_inputs(audio_path, image_path):
    # Process the audio file (assuming this is handled by a function called 'transcribe')
    speech_to_text_output = transcribe(audio_path)

    # Handle the image input
    if image_path:
        chatgpt_output = img2txt(speech_to_text_output, image_path)
    else:
        chatgpt_output = "No image provided."

    # Assuming 'transcribe' also returns the path to a processed audio file
    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")  # Replace with actual path if different

    return speech_to_text_output, chatgpt_output, processed_audio_path

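# End-to-end flow: microphone recording -> Whisper transcript -> LLaVA answer about the
# uploaded image -> gTTS speech saved as Temp3.mp3. Illustrative direct call (assumes
# both files exist locally):
# transcript, answer, audio_file = process_inputs("question.wav", "img.jpg")
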
# Create the interface
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Image(type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio("Temp.mp3")
    ],
    title="Learn OpenAI Whisper: Image processing with Whisper and Llava",
    description="Upload an image and interact via voice input and audio response."
)

# Launch the interface
iface.launch(debug=True)
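# debug=True keeps the process attached and prints full tracebacks to the console, which
# is useful when diagnosing failures in the hosted Space's logs.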