zeeshanali01 committed on
Commit
b6025d2
1 Parent(s): b76d698

Delete multimodal_rag.py

Files changed (1)
  1. multimodal_rag.py +0 -241
multimodal_rag.py DELETED
@@ -1,241 +0,0 @@
- # -*- coding: utf-8 -*-
- """Multimodal RAG.ipynb
-
- Automatically generated by Colaboratory.
-
- Original file is located at
-     https://colab.research.google.com/drive/1RAN3Kvcx2pejp0bdlZ3PFxJC5FgW6Yzz
- """
-
- !pip install -q -U transformers==4.37.2
- !pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
- !pip install -q git+https://github.com/openai/whisper.git
- !pip install -q gradio
- !pip install -q gTTS
-
- import torch
- from transformers import BitsAndBytesConfig, pipeline
-
- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.float16
- )
-
- model_id = "llava-hf/llava-1.5-7b-hf"
-
- pipe = pipeline("image-to-text",
-                 model=model_id,
-                 model_kwargs={"quantization_config": quantization_config})
-
- import whisper
- import gradio as gr
- import time
- import warnings
- import os
- from gtts import gTTS
-
- from PIL import Image
-
- image_path = "img.jpg"
- image = Image.open((image_path))
- image
-
- import nltk
- nltk.download('punkt')
- from nltk import sent_tokenize
-
- import locale
- print(locale.getlocale()) # Before running the pipeline
- # Run the pipeline
- print(locale.getlocale()) # After running the pipeline
-
- max_new_tokens = 200
-
- prompt_instructions = """
- Describe the image using as much detail as possible,
- is it a painting, a photograph, what colors are predominant,
- what is the image about?
- """
-
- prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
-
- outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
- # outputs
- # print(outputs[0]["generated_text"])
- for sent in sent_tokenize(outputs[0]["generated_text"]):
-     print(sent)
-
- warnings.filterwarnings("ignore")
-
- import warnings
- from gtts import gTTS
- import numpy as np
-
- torch.cuda.is_available()
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Using torch {torch.__version__} ({DEVICE})")
-
- import whisper
- model = whisper.load_model("medium", device=DEVICE)
- print(
-     f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
-     f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
- )
-
- import re
-
- input_text = 'What color is the microphone in image?'
- input_image = 'img.jpg'
-
- # load the image
- image = Image.open(input_image)
-
- # prompt_instructions = """
- # Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
- # """
-
- # print(input_text)
- prompt_instructions = """
- Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
- """ + input_text
- prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
-
- # print(prompt)
-
- outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
-
- match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
-
- if match:
-     # Extract the text after "ASSISTANT:"
-     extracted_text = match.group(1)
-     print(extracted_text)
- else:
-     print("No match found.")
-
- for sent in sent_tokenize(outputs[0]["generated_text"]):
-     print(sent)
-
- import datetime
- import os
-
- ## Logger file
- tstamp = datetime.datetime.now()
- tstamp = str(tstamp).replace(' ','_')
- logfile = f'{tstamp}_log.txt'
- def writehistory(text):
-     with open(logfile, 'a', encoding='utf-8') as f:
-         f.write(text)
-         f.write('\n')
-     f.close()
-
- import re
- import requests
- from PIL import Image
-
- def img2txt(input_text, input_image):
-
-     # load the image
-     image = Image.open(input_image)
-
-     writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")
-     if type(input_text) == tuple:
-         prompt_instructions = """
-         Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what is the image about?
-         """
-     else:
-         prompt_instructions = """
-         Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
-         """ + input_text
-
-     writehistory(f"prompt_instructions: {prompt_instructions}")
-     prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
-
-     outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
-
-     # Properly extract the response text
-     if outputs is not None and len(outputs[0]["generated_text"]) > 0:
-         match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
-         if match:
-             # Extract the text after "ASSISTANT:"
-             reply = match.group(1)
-         else:
-             reply = "No response found."
-     else:
-         reply = "No response generated."
-
-     return reply
-
- def transcribe(audio):
-
-     # Check if the audio input is None or empty
-     if audio is None or audio == '':
-         return ('','',None) # Return empty strings and None audio file
-
-     # language = 'en'
-
-     audio = whisper.load_audio(audio)
-     audio = whisper.pad_or_trim(audio)
-
-     mel = whisper.log_mel_spectrogram(audio).to(model.device)
-
-     _, probs = model.detect_language(mel)
-
-     options = whisper.DecodingOptions()
-     result = whisper.decode(model, mel, options)
-     result_text = result.text
-
-     return result_text
-
- def text_to_speech(text, file_path):
-     language = 'en'
-
-     audioobj = gTTS(text = text,
-                     lang = language,
-                     slow = False)
-
-     audioobj.save(file_path)
-
-     return file_path
-
- !ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3
-
- import gradio as gr
- import base64
- import os
-
- # A function to handle audio and image inputs
- def process_inputs(audio_path, image_path):
-     # Process the audio file (assuming this is handled by a function called 'transcribe')
-     speech_to_text_output = transcribe(audio_path)
-
-     # Handle the image input
-     if image_path:
-         chatgpt_output = img2txt(speech_to_text_output, image_path)
-     else:
-         chatgpt_output = "No image provided."
-
-     # Assuming 'transcribe' also returns the path to a processed audio file
-     processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3") # Replace with actual path if different
-
-     return speech_to_text_output, chatgpt_output, processed_audio_path
-
- # Create the interface
- iface = gr.Interface(
-     fn=process_inputs,
-     inputs=[
-         gr.Audio(sources=["microphone"], type="filepath"),
-         gr.Image(type="filepath")
-     ],
-     outputs=[
-         gr.Textbox(label="Speech to Text"),
-         gr.Textbox(label="ChatGPT Output"),
-         gr.Audio("Temp.mp3")
-     ],
-     title="Learn OpenAI Whisper: Image processing with Whisper and Llava",
-     description="Upload an image and interact via voice input and audio response."
- )
-
- # Launch the interface
- iface.launch(debug=True)
-