NotShrirang committed on
Commit 8ed49ee · 1 Parent(s): 3783c34

feat: add audio data input

app.py CHANGED
@@ -12,7 +12,7 @@ from data_upload import data_upload_page
 from data_search import data_search_page
 from data_annotations import data_annotation_page
 from model_finetuning import model_finetuning_page
-from utils import load_clip_model, load_text_embedding_model
+from utils import load_clip_model, load_text_embedding_model, load_whisper_model
 
 os.environ['KMP_DUPLICATE_LIB_OK']='True'
 
@@ -21,6 +21,7 @@ st.set_page_config(layout="wide", page_title="LoomRAG", page_icon="🔍")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_model, preprocess = load_clip_model()
 text_embedding_model = load_text_embedding_model()
+whisper_model = load_whisper_model()
 os.makedirs("annotations/", exist_ok=True)
 os.makedirs("images/", exist_ok=True)
 
@@ -34,9 +35,9 @@ with st.sidebar:
     )
 
 if page == "Data Upload":
-    data_upload_page.data_upload(clip_model, preprocess, text_embedding_model)
+    data_upload_page.data_upload(clip_model, preprocess, text_embedding_model, whisper_model)
 if page == "Data Search":
-    data_search_page.data_search(clip_model, preprocess, text_embedding_model, device)
+    data_search_page.data_search(clip_model, preprocess, text_embedding_model, whisper_model, device)
 if page == "Data Annotation":
     data_annotation_page.data_annotations()
 if page == "Model Fine-Tuning":
data_upload/data_upload_page.py CHANGED
@@ -2,15 +2,15 @@ import os
 import streamlit as st
 import sys
 
-from data_upload.input_sources_utils import image_util, pdf_util, website_util
+from data_upload.input_sources_utils import image_util, pdf_util, website_util, audio_util
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
 
-def data_upload(clip_model, preprocess, text_embedding_model):
+def data_upload(clip_model, preprocess, text_embedding_model, whisper_model):
     st.title("Data Upload")
     st.warning("Please note that this is a public application. Make sure you are not uploading any sensitive data.")
-    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link"], label="Select Upload Type")
+    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link", "Audio Recording"], label="Select Upload Type")
     if upload_choice == "Upload Image":
         image_util.upload_image(clip_model, preprocess)
     elif upload_choice == "Add Image from URL / Link":
@@ -19,3 +19,5 @@ def data_upload(clip_model, preprocess, text_embedding_model):
         pdf_util.upload_pdf(clip_model, preprocess, text_embedding_model)
     elif upload_choice == "Website Link":
         website_util.data_from_website(clip_model, preprocess, text_embedding_model)
+    elif upload_choice == "Audio Recording":
+        audio_util.upload_audio(whisper_model, text_embedding_model)
data_upload/input_sources_utils/audio_util.py ADDED
@@ -0,0 +1,29 @@
+import os
+import requests
+import streamlit as st
+import sys
+import whisper
+
+from vectordb import add_image_to_index, add_pdf_to_index, add_audio_to_index
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+def upload_audio(whisper_model, text_embedding_model):
+    st.title("Upload Audio")
+    recorded_audio = st.audio_input("Record Audio")
+    st.write("---")
+    uploaded_audios = st.file_uploader("Upload Audio", type=["mp3", "wav"], accept_multiple_files=True)
+    if recorded_audio:
+        st.audio(recorded_audio)
+        if st.button("Add Audio"):
+            add_audio_to_index(recorded_audio, whisper_model, text_embedding_model)
+            st.success("Audio Added to Database")
+    if uploaded_audios:
+        for audio in uploaded_audios:
+            st.audio(audio)
+        if st.button("Add Audio"):
+            progress_bar = st.progress(0, f"Adding Audio... | 0/{len(uploaded_audios)}")
+            for count, audio in enumerate(uploaded_audios):
+                add_audio_to_index(audio, whisper_model, text_embedding_model)
+                progress_bar.progress((count + 1) / len(uploaded_audios), f"Adding Audio... | {count + 1}/{len(uploaded_audios)}")
+            st.success("Audio Added to Database")
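
When "Add Audio" is clicked, the recording or uploaded file is handed to add_audio_to_index, which saves it under ./audio/ and transcribes it with Whisper before chunking and embedding. A minimal sketch of that transcription step, assuming a local file in place of Streamlit's UploadedFile object ("./audio/example.wav" is a hypothetical path):

import whisper

# Same "small" checkpoint that utils.load_whisper_model loads.
whisper_model = whisper.load_model("small")

# transcribe() returns a dict; its "text" field holds the full transcript.
transcript = whisper_model.transcribe("./audio/example.wav")["text"]
print(transcript)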
requirements.txt CHANGED
@@ -26,6 +26,7 @@ fonttools==4.55.3
 frozenlist==1.5.0
 fsspec==2024.9.0
 ftfy==6.3.1
+future==1.0.0
 gitdb==4.0.11
 GitPython==3.1.43
 greenlet==3.1.1
@@ -48,20 +49,24 @@ langchain-core==0.3.28
 langchain-experimental==0.3.4
 langchain-text-splitters==0.3.4
 langsmith==0.1.147
+llvmlite==0.43.0
 lxml==5.1.0
 markdown-it-py==3.0.0
 MarkupSafe==3.0.2
 marshmallow==3.23.2
 matplotlib==3.10.0
 mdurl==0.1.2
+more-itertools==10.5.0
 mpmath==1.3.0
 multidict==6.1.0
 multiprocess==0.70.16
 mypy-extensions==1.0.0
 narwhals==1.19.1
 networkx==3.4.2
+numba==0.60.0
 numpy==1.26.4
 open_clip_torch==2.29.0
+openai-whisper==20240930
 orjson==3.10.12
 packaging==24.2
 pandas==2.2.3
@@ -100,6 +105,7 @@ streamlit-option-menu==0.4.0
 sympy==1.13.1
 tenacity==8.5.0
 threadpoolctl==3.5.0
+tiktoken==0.8.0
 timm==1.0.12
 tokenizers==0.21.0
 toml==0.10.2
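
These pins track the new openai-whisper==20240930 dependency: numba (with its llvmlite backend), tiktoken, and more-itertools are in Whisper's dependency tree, and future appears to be pulled in transitively as well.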
utils.py CHANGED
@@ -6,6 +6,7 @@ import os
 from sentence_transformers import SentenceTransformer
 import streamlit as st
 import torch
+import whisper
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -19,6 +20,11 @@ def load_text_embedding_model():
     model = SentenceTransformer("all-MiniLM-L6-v2")
     return model
 
+@st.cache_resource
+def load_whisper_model():
+    model = whisper.load_model("small")
+    return model
+
 def load_image_index():
     index = faiss.read_index('./vectorstore/image_index.index')
     data = pd.read_csv("./vectorstore/image_data.csv")
@@ -29,6 +35,11 @@ def load_text_index():
     data = pd.read_csv("./vectorstore/text_data.csv")
     return index, data
 
+def load_audio_index():
+    index = faiss.read_index('./vectorstore/audio_index.index')
+    data = pd.read_csv("./vectorstore/audio_data.csv")
+    return index, data
+
 def cosine_similarity(a, b):
     return torch.cosine_similarity(a, b)
 
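load_audio_index mirrors the existing image and text loaders. A sketch of how a caller might query the audio index, assuming it has already been built by add_audio_to_index (the query string is illustrative):

from utils import load_audio_index, load_text_embedding_model

# Embed the query with the same all-MiniLM-L6-v2 model used at indexing time.
text_embedding_model = load_text_embedding_model()
index, data = load_audio_index()

query_embedding = text_embedding_model.encode(["what was said about the budget?"])
distances, indices = index.search(query_embedding, 3)  # 3 nearest transcript chunks
for i in indices[0]:
    row = data.iloc[int(i)]
    print(row["path"], "->", row["content"])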
vectordb.py CHANGED
@@ -11,14 +11,17 @@ from sentence_transformers import SentenceTransformer
 import streamlit as st
 import torch
 import time
+import whisper
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 os.makedirs("./vectorstore", exist_ok=True)
 
-def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None):
+def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None, audio_path: str = None):
     if not image_path and not text_content:
         raise ValueError("Either image_path or text_content must be provided.")
+    if audio_path and not text_content:
+        raise ValueError("text_content must be provided when audio_path is provided.")
     if not os.path.exists(f"./vectorstore/{index_path}"):
         if image_path:
             index = faiss.IndexFlatL2(512)
@@ -42,6 +45,15 @@ def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str =
             new_entry_df = pd.DataFrame({"path": image_path, "index": len(df)}, index=[0])
             df = pd.concat([df, new_entry_df], ignore_index=True)
             df.to_csv("./vectorstore/image_data.csv", index=False)
+    elif audio_path:
+        if not os.path.exists("./vectorstore/audio_data.csv"):
+            df = pd.DataFrame([{"path": audio_path, "content": text_content, "index": 0}]).reset_index(drop=True)
+            df.to_csv("./vectorstore/audio_data.csv", index=False)
+        else:
+            df = pd.read_csv("./vectorstore/audio_data.csv").reset_index(drop=True)
+            new_entry_df = pd.DataFrame({"path": audio_path, "content": text_content, "index": len(df)}, index=[0])
+            df = pd.concat([df, new_entry_df], ignore_index=True)
+            df.to_csv("./vectorstore/audio_data.csv", index=False)
     elif text_content:
         if not os.path.exists("./vectorstore/text_data.csv"):
             df = pd.DataFrame([{"content": text_content, "index": 0}]).reset_index(drop=True)
@@ -120,6 +132,41 @@ def add_pdf_to_index(pdf, clip_model: clip.model.CLIP, preprocess, text_embeddin
         progress_bar.progress(percent_complete, f"Processing Page {page_num + 1}/{len(pdf_reader.pages)}")
     return pdf_pages_data
 
+
+def add_audio_to_index(audio, whisper_model: whisper.Whisper, text_embedding_model: SentenceTransformer):
+    if not os.path.exists("./vectorstore/"):
+        os.makedirs("./vectorstore")
+    if not os.path.exists("./audio"):
+        os.makedirs("./audio")
+    if hasattr(audio, "name"):
+        audio_name = audio.name
+    else:
+        audio_name = f"{time.time()}.wav"
+    audio_name = audio_name.replace(" ", "_")
+    with open(f"./audio/{audio_name}", "wb") as f:
+        try:
+            f.write(audio.read())
+        except:
+            if hasattr(audio, "data"):
+                audio = io.BytesIO(audio.data)
+            else:
+                audio = io.BytesIO(audio)
+            f.write(audio.read())
+    audio_transcript: str = whisper_model.transcribe(f"./audio/{audio_name}")["text"]
+    text_splitter = CharacterTextSplitter(
+        separator="\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    chunks = text_splitter.split_text(audio_transcript)
+    text_embeddings = text_embedding_model.encode(chunks)
+    for i, chunk in enumerate(chunks):
+        update_vectordb(index_path="audio_index.index", embedding=text_embeddings[i], text_content=chunk, audio_path=f"./audio/{audio_name}")
+    return audio_transcript
+
+
 def search_image_index_with_image(image_features, index: faiss.IndexFlatL2, clip_model: clip.model.CLIP, k: int = 3):
     with torch.no_grad():
         distances, indices = index.search(image_features.cpu().numpy(), k)
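
Because add_audio_to_index only needs a file-like object with read() (and optionally a name attribute), it also works outside the Streamlit widgets. A sketch under that assumption, with sample.wav as a hypothetical input file:

import whisper
from sentence_transformers import SentenceTransformer
from vectordb import add_audio_to_index

whisper_model = whisper.load_model("small")  # matches utils.load_whisper_model
text_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # matches utils.load_text_embedding_model

# The function copies the file to ./audio/, transcribes it, chunks the
# transcript, and writes one embedding per chunk to audio_index.index.
with open("sample.wav", "rb") as f:
    transcript = add_audio_to_index(f, whisper_model, text_embedding_model)
print(transcript)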