zeeshanali01 committed on
Commit
c8c17e0
1 Parent(s): cd696bb

Upload multimodal_rag.py

Files changed (1)
  1. multimodal_rag.py +241 -0
multimodal_rag.py ADDED
@@ -0,0 +1,241 @@
+ # -*- coding: utf-8 -*-
+ """Multimodal RAG.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1RAN3Kvcx2pejp0bdlZ3PFxJC5FgW6Yzz
+ """
+
+ !pip install -q -U transformers==4.37.2
+ !pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
+ !pip install -q git+https://github.com/openai/whisper.git
+ !pip install -q gradio
+ !pip install -q gTTS
+
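+ # NOTE: the "!" lines above are notebook shell commands; they work in Colab or
+ # Jupyter but are not valid Python in a plain .py script. Outside a notebook,
+ # the same pinned versions would be installed with pip directly, for example:
+ #   pip install -U transformers==4.37.2 bitsandbytes==0.41.3 accelerate==0.25.0 gradio gTTS
+ #   pip install git+https://github.com/openai/whisper.git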
+ import torch
+ from transformers import BitsAndBytesConfig, pipeline
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+ model_id = "llava-hf/llava-1.5-7b-hf"
+
+ pipe = pipeline("image-to-text",
+                 model=model_id,
+                 model_kwargs={"quantization_config": quantization_config})
+
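+ # Loading LLaVA-1.5-7B in 4-bit (with float16 compute) via bitsandbytes keeps
+ # the roughly 7B-parameter model small enough to run on a single Colab-class
+ # GPU such as a T4.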
+ import whisper
+ import gradio as gr
+ import time
+ import warnings
+ import os
+ from gtts import gTTS
+
+ from PIL import Image
+
+ image_path = "img.jpg"
+ image = Image.open(image_path)
+ image  # a bare expression like this only displays the image in a notebook cell
+
+ import nltk
+ nltk.download('punkt')
+ from nltk import sent_tokenize
+
+ import locale
+ # Sanity check: some Colab runtimes end up with a non-UTF-8 locale, which can
+ # break downstream libraries. Compare the locale before and after inference.
+ print(locale.getlocale())  # before running the pipeline
+ # ... run the pipeline ...
+ print(locale.getlocale())  # after running the pipeline
+
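+ # If the locale printed above is not UTF-8, gTTS and other libraries can fail
+ # on Colab. A commonly used workaround (an assumption, not part of the original
+ # notebook) is to force UTF-8 before continuing:
+ # locale.getpreferredencoding = lambda *args: "UTF-8"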
+ max_new_tokens = 200
+
+ prompt_instructions = """
+ Describe the image using as much detail as possible,
+ is it a painting, a photograph, what colors are predominant,
+ what is the image about?
+ """
+
+ prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
+
+ outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
+ # outputs
+ # print(outputs[0]["generated_text"])
+ for sent in sent_tokenize(outputs[0]["generated_text"]):
+     print(sent)
+
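+ # "USER: <image>\n...\nASSISTANT:" is the LLaVA-1.5 chat template: the <image>
+ # token marks where the image features are inserted, and everything generated
+ # after "ASSISTANT:" is the model's answer.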
+ warnings.filterwarnings("ignore")
+
+ import warnings
+ from gtts import gTTS
+ import numpy as np
+
+ torch.cuda.is_available()
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using torch {torch.__version__} ({DEVICE})")
+
+ import whisper
+ model = whisper.load_model("medium", device=DEVICE)
+ print(
+     f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
+     f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
+ )
+
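+ # Whisper "medium" is the multilingual checkpoint (~769M parameters); on a
+ # CPU-only runtime a smaller checkpoint such as "base" or "small" loads and
+ # transcribes considerably faster.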
+ import re
+
+ input_text = 'What color is the microphone in the image?'
+ input_image = 'img.jpg'
+
+ # load the image
+ image = Image.open(input_image)
+
+ # prompt_instructions = """
+ # Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
+ # """
+
+ # print(input_text)
+ prompt_instructions = """
+ Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
+ """ + input_text
+ prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
+
+ # print(prompt)
+
+ outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
+
+ match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
+
+ if match:
+     # Extract the text after "ASSISTANT:"
+     extracted_text = match.group(1)
+     print(extracted_text)
+ else:
+     print("No match found.")
+
+ for sent in sent_tokenize(outputs[0]["generated_text"]):
+     print(sent)
+
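+ # Note: r'ASSISTANT:\s*(.*)' stops at the first newline, so only the first
+ # paragraph of the answer is extracted; adding the re.DOTALL flag would
+ # capture multi-paragraph responses as well.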
+ import datetime
+ import os
+
+ ## Logger file
+ tstamp = datetime.datetime.now()
+ tstamp = str(tstamp).replace(' ', '_')
+ logfile = f'{tstamp}_log.txt'
+ def writehistory(text):
+     with open(logfile, 'a', encoding='utf-8') as f:
+         f.write(text)
+         f.write('\n')
+         # no explicit f.close() needed; the "with" block closes the file
+
+ import re
+ import requests
+ from PIL import Image
+
+ def img2txt(input_text, input_image):
+
+     # load the image
+     image = Image.open(input_image)
+
+     writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")
+     if isinstance(input_text, tuple) or not input_text:
+         # no usable transcript, so fall back to a generic description prompt
+         prompt_instructions = """
+ Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
+ """
+     else:
+         prompt_instructions = """
+ Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
+ """ + input_text
+
+     writehistory(f"prompt_instructions: {prompt_instructions}")
+     prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
+
+     outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+
+     # Properly extract the response text
+     if outputs is not None and len(outputs[0]["generated_text"]) > 0:
+         match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
+         if match:
+             # Extract the text after "ASSISTANT:"
+             reply = match.group(1)
+         else:
+             reply = "No response found."
+     else:
+         reply = "No response generated."
+
+     return reply
+
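+ # Example call (paths are illustrative): img2txt("What color is the microphone?", "img.jpg")
+ # When no usable transcript is available, the function falls back to a generic
+ # "describe the image" prompt instead of the user's question.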
+ def transcribe(audio):
+
+     # Check if the audio input is None or empty
+     if audio is None or audio == '':
+         return ''  # return an empty transcript so callers always get a string
+
+     # language = 'en'
+
+     audio = whisper.load_audio(audio)
+     audio = whisper.pad_or_trim(audio)
+
+     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+     _, probs = model.detect_language(mel)
+
+     options = whisper.DecodingOptions()
+     result = whisper.decode(model, mel, options)
+     result_text = result.text
+
+     return result_text
+
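+ # whisper.pad_or_trim() fixes the audio at 30 seconds, so whisper.decode()
+ # transcribes a single 30-second window; for longer recordings,
+ # model.transcribe(audio_path) handles chunking automatically. On a CPU-only
+ # runtime, whisper.DecodingOptions(fp16=False) avoids the "FP16 is not
+ # supported on CPU" warning.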
+ def text_to_speech(text, file_path):
+     language = 'en'
+
+     audioobj = gTTS(text=text,
+                     lang=language,
+                     slow=False)
+
+     audioobj.save(file_path)
+
+     return file_path
+
+ !ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3
+
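+ # gTTS synthesizes speech through Google's online TTS service, so an internet
+ # connection is required. The ffmpeg command above is another notebook shell
+ # command: it writes 10 seconds of silence to Temp.mp3 as a placeholder for
+ # the Gradio audio output component.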
+ import gradio as gr
+ import base64
+ import os
+
+ # A function to handle audio and image inputs
+ def process_inputs(audio_path, image_path):
+     # Transcribe the recorded audio with Whisper
+     speech_to_text_output = transcribe(audio_path)
+
+     # Handle the image input
+     if image_path:
+         chatgpt_output = img2txt(speech_to_text_output, image_path)
+     else:
+         chatgpt_output = "No image provided."
+
+     # Convert the model's text reply to speech for the audio output component
+     processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")
+
+     return speech_to_text_output, chatgpt_output, processed_audio_path
+
+ # Create the interface
+ iface = gr.Interface(
+     fn=process_inputs,
+     inputs=[
+         gr.Audio(sources=["microphone"], type="filepath"),
+         gr.Image(type="filepath")
+     ],
+     outputs=[
+         gr.Textbox(label="Speech to Text"),
+         gr.Textbox(label="ChatGPT Output"),
+         gr.Audio("Temp.mp3")
+     ],
+     title="Learn OpenAI Whisper: Image processing with Whisper and Llava",
+     description="Upload an image and interact via voice input and audio response."
+ )
+
+ # Launch the interface
+ iface.launch(debug=True)
+
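+ # launch(debug=True) keeps the cell running and surfaces tracebacks in the
+ # notebook output; adding share=True would also print a temporary public
+ # gradio.live URL for testing from another device.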