Commit · 8ed49ee
Parent(s): 3783c34

feat: add audio data input
Files changed:
- app.py (+4 -3)
- data_upload/data_upload_page.py (+5 -3)
- data_upload/input_sources_utils/audio_util.py (+29 -0)
- requirements.txt (+6 -0)
- utils.py (+11 -0)
- vectordb.py (+48 -1)
app.py CHANGED

@@ -12,7 +12,7 @@ from data_upload import data_upload_page
 from data_search import data_search_page
 from data_annotations import data_annotation_page
 from model_finetuning import model_finetuning_page
-from utils import load_clip_model, load_text_embedding_model
+from utils import load_clip_model, load_text_embedding_model, load_whisper_model
 
 os.environ['KMP_DUPLICATE_LIB_OK']='True'
 
@@ -21,6 +21,7 @@ st.set_page_config(layout="wide", page_title="LoomRAG", page_icon="🔍")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_model, preprocess = load_clip_model()
 text_embedding_model = load_text_embedding_model()
+whisper_model = load_whisper_model()
 os.makedirs("annotations/", exist_ok=True)
 os.makedirs("images/", exist_ok=True)
 
@@ -34,9 +35,9 @@ with st.sidebar:
 )
 
 if page == "Data Upload":
-    data_upload_page.data_upload(clip_model, preprocess, text_embedding_model)
+    data_upload_page.data_upload(clip_model, preprocess, text_embedding_model, whisper_model)
 if page == "Data Search":
-    data_search_page.data_search(clip_model, preprocess, text_embedding_model, device)
+    data_search_page.data_search(clip_model, preprocess, text_embedding_model, whisper_model, device)
 if page == "Data Annotation":
     data_annotation_page.data_annotations()
 if page == "Model Fine-Tuning":
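For reference, a minimal sketch of the new wiring in isolation (not part of the commit; it assumes the LoomRAG repository layout and is meant to be launched with streamlit run): the Whisper model is loaded once through the cached loader and passed into the upload page alongside the CLIP and text-embedding models.

# Hedged sketch, not from the commit: exercise the new loader/page wiring directly.
# Assumes this file lives at the repository root and is run via `streamlit run`.
from utils import load_clip_model, load_text_embedding_model, load_whisper_model
from data_upload import data_upload_page

clip_model, preprocess = load_clip_model()
text_embedding_model = load_text_embedding_model()
whisper_model = load_whisper_model()  # whisper.load_model("small"), cached via st.cache_resource

data_upload_page.data_upload(clip_model, preprocess, text_embedding_model, whisper_model)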
data_upload/data_upload_page.py CHANGED

@@ -2,15 +2,15 @@ import os
 import streamlit as st
 import sys
 
-from data_upload.input_sources_utils import image_util, pdf_util, website_util
+from data_upload.input_sources_utils import image_util, pdf_util, website_util, audio_util
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
 
-def data_upload(clip_model, preprocess, text_embedding_model):
+def data_upload(clip_model, preprocess, text_embedding_model, whisper_model):
     st.title("Data Upload")
     st.warning("Please note that this is a public application. Make sure you are not uploading any sensitive data.")
-    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link"], label="Select Upload Type")
+    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link", "Audio Recording"], label="Select Upload Type")
     if upload_choice == "Upload Image":
         image_util.upload_image(clip_model, preprocess)
     elif upload_choice == "Add Image from URL / Link":
@@ -19,3 +19,5 @@ def data_upload(clip_model, preprocess, text_embedding_model):
         pdf_util.upload_pdf(clip_model, preprocess, text_embedding_model)
     elif upload_choice == "Website Link":
         website_util.data_from_website(clip_model, preprocess, text_embedding_model)
+    elif upload_choice == "Audio Recording":
+        audio_util.upload_audio(whisper_model, text_embedding_model)
data_upload/input_sources_utils/audio_util.py ADDED

@@ -0,0 +1,29 @@
+import os
+import requests
+import streamlit as st
+import sys
+import whisper
+
+from vectordb import add_image_to_index, add_pdf_to_index, add_audio_to_index
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+def upload_audio(whisper_model, text_embedding_model):
+    st.title("Upload Audio")
+    recorded_audio = st.audio_input("Record Audio")
+    st.write("---")
+    uploaded_audios = st.file_uploader("Upload Audio", type=["mp3", "wav"], accept_multiple_files=True)
+    if recorded_audio:
+        st.audio(recorded_audio)
+        if st.button("Add Audio"):
+            add_audio_to_index(recorded_audio, whisper_model, text_embedding_model)
+            st.success("Audio Added to Database")
+    if uploaded_audios:
+        for audio in uploaded_audios:
+            st.audio(audio)
+        if st.button("Add Audio"):
+            progress_bar = st.progress(0, f"Adding Audio... | 0/{len(uploaded_audios)}")
+            for count, audio in enumerate(uploaded_audios):
+                add_audio_to_index(audio, whisper_model, text_embedding_model)
+                progress_bar.progress((count + 1) / len(uploaded_audios), f"Adding Audio... | {count + 1}/{len(uploaded_audios)}")
+            st.success("Audio Added to Database")
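A hedged sketch of how this page's two input paths behave when exercised standalone (an assumption for illustration, not part of the commit): both st.audio_input and st.file_uploader yield file-like objects that upload_audio hands straight to add_audio_to_index.

# Hedged sketch (assumed standalone Streamlit page, not from the commit).
import streamlit as st
from utils import load_text_embedding_model, load_whisper_model
from vectordb import add_audio_to_index

whisper_model = load_whisper_model()
text_embedding_model = load_text_embedding_model()

recording = st.audio_input("Record a note")  # UploadedFile-like object, or None until recorded
if recording is not None and st.button("Transcribe and index"):
    transcript = add_audio_to_index(recording, whisper_model, text_embedding_model)
    st.write(transcript)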
requirements.txt CHANGED

@@ -26,6 +26,7 @@ fonttools==4.55.3
 frozenlist==1.5.0
 fsspec==2024.9.0
 ftfy==6.3.1
+future==1.0.0
 gitdb==4.0.11
 GitPython==3.1.43
 greenlet==3.1.1
@@ -48,20 +49,24 @@ langchain-core==0.3.28
 langchain-experimental==0.3.4
 langchain-text-splitters==0.3.4
 langsmith==0.1.147
+llvmlite==0.43.0
 lxml==5.1.0
 markdown-it-py==3.0.0
 MarkupSafe==3.0.2
 marshmallow==3.23.2
 matplotlib==3.10.0
 mdurl==0.1.2
+more-itertools==10.5.0
 mpmath==1.3.0
 multidict==6.1.0
 multiprocess==0.70.16
 mypy-extensions==1.0.0
 narwhals==1.19.1
 networkx==3.4.2
+numba==0.60.0
 numpy==1.26.4
 open_clip_torch==2.29.0
+openai-whisper==20240930
 orjson==3.10.12
 packaging==24.2
 pandas==2.2.3
@@ -100,6 +105,7 @@ streamlit-option-menu==0.4.0
 sympy==1.13.1
 tenacity==8.5.0
 threadpoolctl==3.5.0
+tiktoken==0.8.0
 timm==1.0.12
 tokenizers==0.21.0
 toml==0.10.2
utils.py CHANGED

@@ -6,6 +6,7 @@ import os
 from sentence_transformers import SentenceTransformer
 import streamlit as st
 import torch
+import whisper
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -19,6 +20,11 @@ def load_text_embedding_model():
     model = SentenceTransformer("all-MiniLM-L6-v2")
     return model
 
+@st.cache_resource
+def load_whisper_model():
+    model = whisper.load_model("small")
+    return model
+
 def load_image_index():
     index = faiss.read_index('./vectorstore/image_index.index')
     data = pd.read_csv("./vectorstore/image_data.csv")
@@ -29,6 +35,11 @@ def load_text_index():
     data = pd.read_csv("./vectorstore/text_data.csv")
     return index, data
 
+def load_audio_index():
+    index = faiss.read_index('./vectorstore/audio_index.index')
+    data = pd.read_csv("./vectorstore/audio_data.csv")
+    return index, data
+
 def cosine_similarity(a, b):
     return torch.cosine_similarity(a, b)
 
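A hedged sketch of querying the new audio index (an assumption: the search-page wiring is not part of this commit). Because add_audio_to_index stores MiniLM embeddings of transcript chunks, the natural query is a text embedding; the query string below is hypothetical and the index files must already exist.

# Hedged sketch, assumed usage: search the audio index with a text-query embedding.
import numpy as np
from utils import load_text_embedding_model, load_audio_index

text_embedding_model = load_text_embedding_model()
index, data = load_audio_index()  # FAISS index plus audio_data.csv (path, content, index columns)

query = text_embedding_model.encode(["hypothetical question about the recording"])
distances, indices = index.search(np.asarray(query, dtype="float32"), 3)
for i in indices[0]:
    if i == -1:  # FAISS pads with -1 when fewer than k entries exist
        continue
    row = data.iloc[int(i)]
    print(row["path"], row["content"][:80])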
vectordb.py CHANGED

@@ -11,14 +11,17 @@ from sentence_transformers import SentenceTransformer
 import streamlit as st
 import torch
 import time
+import whisper
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 os.makedirs("./vectorstore", exist_ok=True)
 
-def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None):
+def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None, audio_path: str = None):
     if not image_path and not text_content:
         raise ValueError("Either image_path or text_content must be provided.")
+    if audio_path and not text_content:
+        raise ValueError("text_content must be provided when audio_path is provided.")
     if not os.path.exists(f"./vectorstore/{index_path}"):
         if image_path:
             index = faiss.IndexFlatL2(512)
@@ -42,6 +45,15 @@
             new_entry_df = pd.DataFrame({"path": image_path, "index": len(df)}, index=[0])
             df = pd.concat([df, new_entry_df], ignore_index=True)
             df.to_csv("./vectorstore/image_data.csv", index=False)
+    elif audio_path:
+        if not os.path.exists("./vectorstore/audio_data.csv"):
+            df = pd.DataFrame([{"path": audio_path, "content": text_content, "index": 0}]).reset_index(drop=True)
+            df.to_csv("./vectorstore/audio_data.csv", index=False)
+        else:
+            df = pd.read_csv("./vectorstore/audio_data.csv").reset_index(drop=True)
+            new_entry_df = pd.DataFrame({"path": audio_path, "content": text_content, "index": len(df)}, index=[0])
+            df = pd.concat([df, new_entry_df], ignore_index=True)
+            df.to_csv("./vectorstore/audio_data.csv", index=False)
     elif text_content:
         if not os.path.exists("./vectorstore/text_data.csv"):
             df = pd.DataFrame([{"content": text_content, "index": 0}]).reset_index(drop=True)
@@ -120,6 +132,41 @@ def add_pdf_to_index(pdf, clip_model: clip.model.CLIP, preprocess, text_embeddin
         progress_bar.progress(percent_complete, f"Processing Page {page_num + 1}/{len(pdf_reader.pages)}")
     return pdf_pages_data
 
+
+def add_audio_to_index(audio, whisper_model: whisper.Whisper, text_embedding_model: SentenceTransformer):
+    if not os.path.exists("./vectorstore/"):
+        os.makedirs("./vectorstore")
+    if not os.path.exists("./audio"):
+        os.makedirs("./audio")
+    if hasattr(audio, "name"):
+        audio_name = audio.name
+    else:
+        audio_name = f"{time.time()}.wav"
+    audio_name = audio_name.replace(" ", "_")
+    with open(f"./audio/{audio_name}", "wb") as f:
+        try:
+            f.write(audio.read())
+        except:
+            if hasattr(audio, "data"):
+                audio = io.BytesIO(audio.data)
+            else:
+                audio = io.BytesIO(audio)
+            f.write(audio.read())
+    audio_transcript: str = whisper_model.transcribe(f"./audio/{audio_name}")["text"]
+    text_splitter = CharacterTextSplitter(
+        separator="\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    chunks = text_splitter.split_text(audio_transcript)
+    text_embeddings = text_embedding_model.encode(chunks)
+    for i, chunk in enumerate(chunks):
+        update_vectordb(index_path="audio_index.index", embedding=text_embeddings[i], text_content=chunk, audio_path=f"./audio/{audio_name}")
+    return audio_transcript
+
+
 def search_image_index_with_image(image_features, index: faiss.IndexFlatL2, clip_model: clip.model.CLIP, k: int = 3):
     with torch.no_grad():
         distances, indices = index.search(image_features.cpu().numpy(), k)
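A hedged end-to-end sketch of add_audio_to_index outside Streamlit (assumed invocation; the sample path is hypothetical). An io.BytesIO wrapper carries no name attribute, so the function falls back to a timestamped .wav filename under ./audio/ before transcribing, chunking, embedding, and indexing the result.

# Hedged sketch, not part of the commit: index a local recording and print its transcript.
# openai-whisper shells out to ffmpeg, so ffmpeg must be on PATH.
import io
from utils import load_text_embedding_model, load_whisper_model
from vectordb import add_audio_to_index

whisper_model = load_whisper_model()
text_embedding_model = load_text_embedding_model()

with open("./samples/meeting.wav", "rb") as f:  # hypothetical sample file
    audio = io.BytesIO(f.read())                # no .name, so a timestamped file is written

transcript = add_audio_to_index(audio, whisper_model, text_embedding_model)
print(transcript[:200])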