Spaces:
Runtime error
zeeshanali01 committed on
Commit b6025d2 • 1 Parent(s): b76d698
Delete multimodal_rag.py
Browse files: multimodal_rag.py +0 -241
multimodal_rag.py DELETED
@@ -1,241 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Multimodal RAG.ipynb
-
-Automatically generated by Colaboratory.
-
-Original file is located at
-    https://colab.research.google.com/drive/1RAN3Kvcx2pejp0bdlZ3PFxJC5FgW6Yzz
-"""
-
-!pip install -q -U transformers==4.37.2
-!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
-!pip install -q git+https://github.com/openai/whisper.git
-!pip install -q gradio
-!pip install -q gTTS
-
-import torch
-from transformers import BitsAndBytesConfig, pipeline
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16
-)
-
-model_id = "llava-hf/llava-1.5-7b-hf"
-
-pipe = pipeline("image-to-text",
-                model=model_id,
-                model_kwargs={"quantization_config": quantization_config})
-
-import whisper
-import gradio as gr
-import time
-import warnings
-import os
-from gtts import gTTS
-
-from PIL import Image
-
-image_path = "img.jpg"
-image = Image.open(image_path)
-image
-
-import nltk
-nltk.download('punkt')
-from nltk import sent_tokenize
-
-import locale
-print(locale.getlocale())  # Before running the pipeline
-# Run the pipeline
-print(locale.getlocale())  # After running the pipeline
-
-max_new_tokens = 200
-
-prompt_instructions = """
-Describe the image using as much detail as possible,
-is it a painting, a photograph, what colors are predominant,
-what is the image about?
-"""
-
-prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
-
-outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
-# outputs
-# print(outputs[0]["generated_text"])
-for sent in sent_tokenize(outputs[0]["generated_text"]):
-    print(sent)
-
-warnings.filterwarnings("ignore")
-
-import warnings
-from gtts import gTTS
-import numpy as np
-
-torch.cuda.is_available()
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using torch {torch.__version__} ({DEVICE})")
-
-import whisper
-model = whisper.load_model("medium", device=DEVICE)
-print(
-    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
-    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
-)
-
-import re
-
-input_text = 'What color is the microphone in image?'
-input_image = 'img.jpg'
-
-# load the image
-image = Image.open(input_image)
-
-# prompt_instructions = """
-# Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
-# """
-
-# print(input_text)
-prompt_instructions = """
-Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
-""" + input_text
-prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
-
-# print(prompt)
-
-outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
-
-match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
-
-if match:
-    # Extract the text after "ASSISTANT:"
-    extracted_text = match.group(1)
-    print(extracted_text)
-else:
-    print("No match found.")
-
-for sent in sent_tokenize(outputs[0]["generated_text"]):
-    print(sent)
-
-import datetime
-import os
-
-## Logger file
-tstamp = datetime.datetime.now()
-tstamp = str(tstamp).replace(' ', '_')
-logfile = f'{tstamp}_log.txt'
-def writehistory(text):
-    with open(logfile, 'a', encoding='utf-8') as f:
-        f.write(text)
-        f.write('\n')
-    f.close()
-
-import re
-import requests
-from PIL import Image
-
-def img2txt(input_text, input_image):
-
-    # load the image
-    image = Image.open(input_image)
-
-    writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")
-    if type(input_text) == tuple:
-        prompt_instructions = """
-        Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
-        """
-    else:
-        prompt_instructions = """
-        Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
-        """ + input_text
-
-    writehistory(f"prompt_instructions: {prompt_instructions}")
-    prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
-
-    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
-
-    # Properly extract the response text
-    if outputs is not None and len(outputs[0]["generated_text"]) > 0:
-        match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
-        if match:
-            # Extract the text after "ASSISTANT:"
-            reply = match.group(1)
-        else:
-            reply = "No response found."
-    else:
-        reply = "No response generated."
-
-    return reply
-
-def transcribe(audio):
-
-    # Check if the audio input is None or empty
-    if audio is None or audio == '':
-        return ('', '', None)  # Return empty strings and None audio file
-
-    # language = 'en'
-
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
-
-    mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-    _, probs = model.detect_language(mel)
-
-    options = whisper.DecodingOptions()
-    result = whisper.decode(model, mel, options)
-    result_text = result.text
-
-    return result_text
-
-def text_to_speech(text, file_path):
-    language = 'en'
-
-    audioobj = gTTS(text=text,
-                    lang=language,
-                    slow=False)
-
-    audioobj.save(file_path)
-
-    return file_path
-
-!ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3
-
-import gradio as gr
-import base64
-import os
-
-# A function to handle audio and image inputs
-def process_inputs(audio_path, image_path):
-    # Process the audio file (assuming this is handled by a function called 'transcribe')
-    speech_to_text_output = transcribe(audio_path)
-
-    # Handle the image input
-    if image_path:
-        chatgpt_output = img2txt(speech_to_text_output, image_path)
-    else:
-        chatgpt_output = "No image provided."
-
-    # Assuming 'transcribe' also returns the path to a processed audio file
-    processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")  # Replace with actual path if different
-
-    return speech_to_text_output, chatgpt_output, processed_audio_path
-
-# Create the interface
-iface = gr.Interface(
-    fn=process_inputs,
-    inputs=[
-        gr.Audio(sources=["microphone"], type="filepath"),
-        gr.Image(type="filepath")
-    ],
-    outputs=[
-        gr.Textbox(label="Speech to Text"),
-        gr.Textbox(label="ChatGPT Output"),
-        gr.Audio("Temp.mp3")
-    ],
-    title="Learn OpenAI Whisper: Image processing with Whisper and Llava",
-    description="Upload an image and interact via voice input and audio response."
-)
-
-# Launch the interface
-iface.launch(debug=True)
-
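The deleted script chained four stages inside a single Gradio handler: Whisper speech-to-text, a 4-bit quantized LLaVA image-to-text pipeline, a regex that keeps only the text after "ASSISTANT:", and gTTS for the spoken reply. For reference, below is a condensed sketch of that flow, assuming the same pinned packages installed in the script above; it substitutes Whisper's higher-level transcribe() helper for the manual mel-spectrogram decoding, and the output path reply.mp3 is a hypothetical name, not one from the original file.

# Condensed sketch of the deleted pipeline (not the original file):
# microphone audio -> Whisper -> LLaVA (4-bit) -> gTTS -> Gradio outputs.
import re

import gradio as gr
import torch
import whisper
from gtts import gTTS
from PIL import Image
from transformers import BitsAndBytesConfig, pipeline

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit LLaVA image-to-text pipeline, configured as in the deleted script
llava = pipeline(
    "image-to-text",
    model="llava-hf/llava-1.5-7b-hf",
    model_kwargs={"quantization_config": BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)},
)

# Whisper ASR checkpoint used by the deleted script
asr = whisper.load_model("medium", device=DEVICE)

def answer(audio_path, image_path):
    # Speech -> text; transcribe() stands in for the manual decode steps
    question = asr.transcribe(audio_path)["text"] if audio_path else ""

    # Image + question -> LLaVA completion, then keep only the assistant turn
    prompt = f"USER: <image>\n{question}\nASSISTANT:"
    generated = llava(Image.open(image_path), prompt=prompt,
                      generate_kwargs={"max_new_tokens": 200})[0]["generated_text"]
    match = re.search(r"ASSISTANT:\s*(.*)", generated, re.DOTALL)
    reply = match.group(1) if match else "No response found."

    # Text -> speech for the audio output component (hypothetical path)
    gTTS(text=reply, lang="en").save("reply.mp3")
    return question, reply, "reply.mp3"

gr.Interface(
    fn=answer,
    inputs=[gr.Audio(sources=["microphone"], type="filepath"),
            gr.Image(type="filepath")],
    outputs=[gr.Textbox(label="Speech to Text"),
             gr.Textbox(label="ChatGPT Output"),
             gr.Audio(type="filepath")],
).launch()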