NotShrirang committed on
Commit 8ed49ee · 1 Parent(s): 3783c34

feat: add audio data input

app.py CHANGED
@@ -12,7 +12,7 @@ from data_upload import data_upload_page
 from data_search import data_search_page
 from data_annotations import data_annotation_page
 from model_finetuning import model_finetuning_page
-from utils import load_clip_model, load_text_embedding_model
+from utils import load_clip_model, load_text_embedding_model, load_whisper_model
 
 os.environ['KMP_DUPLICATE_LIB_OK']='True'
 
@@ -21,6 +21,7 @@ st.set_page_config(layout="wide", page_title="LoomRAG", page_icon="🔍")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 clip_model, preprocess = load_clip_model()
 text_embedding_model = load_text_embedding_model()
+whisper_model = load_whisper_model()
 os.makedirs("annotations/", exist_ok=True)
 os.makedirs("images/", exist_ok=True)
 
@@ -34,9 +35,9 @@ with st.sidebar:
     )
 
 if page == "Data Upload":
-    data_upload_page.data_upload(clip_model, preprocess, text_embedding_model)
+    data_upload_page.data_upload(clip_model, preprocess, text_embedding_model, whisper_model)
 if page == "Data Search":
-    data_search_page.data_search(clip_model, preprocess, text_embedding_model, device)
+    data_search_page.data_search(clip_model, preprocess, text_embedding_model, whisper_model, device)
 if page == "Data Annotation":
     data_annotation_page.data_annotations()
 if page == "Model Fine-Tuning":
data_upload/data_upload_page.py CHANGED
@@ -2,15 +2,15 @@ import os
 import streamlit as st
 import sys
 
-from data_upload.input_sources_utils import image_util, pdf_util, website_util
+from data_upload.input_sources_utils import image_util, pdf_util, website_util, audio_util
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
 
-def data_upload(clip_model, preprocess, text_embedding_model):
+def data_upload(clip_model, preprocess, text_embedding_model, whisper_model):
     st.title("Data Upload")
     st.warning("Please note that this is a public application. Make sure you are not uploading any sensitive data.")
-    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link"], label="Select Upload Type")
+    upload_choice = st.selectbox(options=["Upload Image", "Add Image from URL / Link", "Upload PDF", "Website Link", "Audio Recording"], label="Select Upload Type")
     if upload_choice == "Upload Image":
         image_util.upload_image(clip_model, preprocess)
     elif upload_choice == "Add Image from URL / Link":
@@ -19,3 +19,5 @@ def data_upload(clip_model, preprocess, text_embedding_model):
         pdf_util.upload_pdf(clip_model, preprocess, text_embedding_model)
     elif upload_choice == "Website Link":
         website_util.data_from_website(clip_model, preprocess, text_embedding_model)
+    elif upload_choice == "Audio Recording":
+        audio_util.upload_audio(whisper_model, text_embedding_model)
data_upload/input_sources_utils/audio_util.py ADDED
@@ -0,0 +1,29 @@
+import os
+import requests
+import streamlit as st
+import sys
+import whisper
+
+from vectordb import add_image_to_index, add_pdf_to_index, add_audio_to_index
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+def upload_audio(whisper_model, text_embedding_model):
+    st.title("Upload Audio")
+    recorded_audio = st.audio_input("Record Audio")
+    st.write("---")
+    uploaded_audios = st.file_uploader("Upload Audio", type=["mp3", "wav"], accept_multiple_files=True)
+    if recorded_audio:
+        st.audio(recorded_audio)
+        if st.button("Add Audio"):
+            add_audio_to_index(recorded_audio, whisper_model, text_embedding_model)
+            st.success("Audio Added to Database")
+    if uploaded_audios:
+        for audio in uploaded_audios:
+            st.audio(audio)
+        if st.button("Add Audio"):
+            progress_bar = st.progress(0, f"Adding Audio... | 0/{len(uploaded_audios)}")
+            for count, audio in enumerate(uploaded_audios):
+                add_audio_to_index(audio, whisper_model, text_embedding_model)
+                progress_bar.progress((count + 1) / len(uploaded_audios), f"Adding Audio... | {count + 1}/{len(uploaded_audios)}")
+            st.success("Audio Added to Database")
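
When "Add Audio" is clicked, the recording or uploaded file is handed to add_audio_to_index, which saves it under ./audio/ and transcribes it with Whisper before chunking and embedding. A minimal sketch of that transcription step, assuming a local file in place of Streamlit's UploadedFile object ("./audio/example.wav" is a hypothetical path):

import whisper

# Same "small" checkpoint that utils.load_whisper_model loads.
whisper_model = whisper.load_model("small")

# transcribe() returns a dict; its "text" field holds the full transcript.
transcript = whisper_model.transcribe("./audio/example.wav")["text"]
print(transcript)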
requirements.txt CHANGED
@@ -26,6 +26,7 @@ fonttools==4.55.3
 frozenlist==1.5.0
 fsspec==2024.9.0
 ftfy==6.3.1
+future==1.0.0
 gitdb==4.0.11
 GitPython==3.1.43
 greenlet==3.1.1
@@ -48,20 +49,24 @@ langchain-core==0.3.28
 langchain-experimental==0.3.4
 langchain-text-splitters==0.3.4
 langsmith==0.1.147
+llvmlite==0.43.0
 lxml==5.1.0
 markdown-it-py==3.0.0
 MarkupSafe==3.0.2
 marshmallow==3.23.2
 matplotlib==3.10.0
 mdurl==0.1.2
+more-itertools==10.5.0
 mpmath==1.3.0
 multidict==6.1.0
 multiprocess==0.70.16
 mypy-extensions==1.0.0
 narwhals==1.19.1
 networkx==3.4.2
+numba==0.60.0
 numpy==1.26.4
 open_clip_torch==2.29.0
+openai-whisper==20240930
 orjson==3.10.12
 packaging==24.2
 pandas==2.2.3
@@ -100,6 +105,7 @@ streamlit-option-menu==0.4.0
 sympy==1.13.1
 tenacity==8.5.0
 threadpoolctl==3.5.0
+tiktoken==0.8.0
 timm==1.0.12
 tokenizers==0.21.0
 toml==0.10.2
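
These pins track the new openai-whisper==20240930 dependency: numba (with its llvmlite backend), tiktoken, and more-itertools are in Whisper's dependency tree, and future appears to be pulled in transitively as well.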
utils.py CHANGED
@@ -6,6 +6,7 @@ import os
 from sentence_transformers import SentenceTransformer
 import streamlit as st
 import torch
+import whisper
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -19,6 +20,11 @@ def load_text_embedding_model():
     model = SentenceTransformer("all-MiniLM-L6-v2")
     return model
 
+@st.cache_resource
+def load_whisper_model():
+    model = whisper.load_model("small")
+    return model
+
 def load_image_index():
     index = faiss.read_index('./vectorstore/image_index.index')
     data = pd.read_csv("./vectorstore/image_data.csv")
@@ -29,6 +35,11 @@ def load_text_index():
     data = pd.read_csv("./vectorstore/text_data.csv")
     return index, data
 
+def load_audio_index():
+    index = faiss.read_index('./vectorstore/audio_index.index')
+    data = pd.read_csv("./vectorstore/audio_data.csv")
+    return index, data
+
 def cosine_similarity(a, b):
     return torch.cosine_similarity(a, b)
 
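load_audio_index mirrors the existing image and text loaders. A sketch of how a caller might query the audio index, assuming it has already been built by add_audio_to_index (the query string is illustrative):

from utils import load_audio_index, load_text_embedding_model

# Embed the query with the same all-MiniLM-L6-v2 model used at indexing time.
text_embedding_model = load_text_embedding_model()
index, data = load_audio_index()

query_embedding = text_embedding_model.encode(["what was said about the budget?"])
distances, indices = index.search(query_embedding, 3)  # 3 nearest transcript chunks
for i in indices[0]:
    row = data.iloc[int(i)]
    print(row["path"], "->", row["content"])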
vectordb.py CHANGED
@@ -11,14 +11,17 @@ from sentence_transformers import SentenceTransformer
 import streamlit as st
 import torch
 import time
+import whisper
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 os.makedirs("./vectorstore", exist_ok=True)
 
-def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None):
+def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str = None, text_content: str = None, audio_path: str = None):
     if not image_path and not text_content:
         raise ValueError("Either image_path or text_content must be provided.")
+    if audio_path and not text_content:
+        raise ValueError("text_content must be provided when audio_path is provided.")
     if not os.path.exists(f"./vectorstore/{index_path}"):
         if image_path:
             index = faiss.IndexFlatL2(512)
@@ -42,6 +45,15 @@ def update_vectordb(index_path: str, embedding: torch.Tensor, image_path: str =
             new_entry_df = pd.DataFrame({"path": image_path, "index": len(df)}, index=[0])
             df = pd.concat([df, new_entry_df], ignore_index=True)
             df.to_csv("./vectorstore/image_data.csv", index=False)
+    elif audio_path:
+        if not os.path.exists("./vectorstore/audio_data.csv"):
+            df = pd.DataFrame([{"path": audio_path, "content": text_content, "index": 0}]).reset_index(drop=True)
+            df.to_csv("./vectorstore/audio_data.csv", index=False)
+        else:
+            df = pd.read_csv("./vectorstore/audio_data.csv").reset_index(drop=True)
+            new_entry_df = pd.DataFrame({"path": audio_path, "content": text_content, "index": len(df)}, index=[0])
+            df = pd.concat([df, new_entry_df], ignore_index=True)
+            df.to_csv("./vectorstore/audio_data.csv", index=False)
     elif text_content:
         if not os.path.exists("./vectorstore/text_data.csv"):
             df = pd.DataFrame([{"content": text_content, "index": 0}]).reset_index(drop=True)
@@ -120,6 +132,41 @@ def add_pdf_to_index(pdf, clip_model: clip.model.CLIP, preprocess, text_embeddin
         progress_bar.progress(percent_complete, f"Processing Page {page_num + 1}/{len(pdf_reader.pages)}")
     return pdf_pages_data
 
+
+def add_audio_to_index(audio, whisper_model: whisper.Whisper, text_embedding_model: SentenceTransformer):
+    if not os.path.exists("./vectorstore/"):
+        os.makedirs("./vectorstore")
+    if not os.path.exists("./audio"):
+        os.makedirs("./audio")
+    if hasattr(audio, "name"):
+        audio_name = audio.name
+    else:
+        audio_name = f"{time.time()}.wav"
+    audio_name = audio_name.replace(" ", "_")
+    with open(f"./audio/{audio_name}", "wb") as f:
+        try:
+            f.write(audio.read())
+        except:
+            if hasattr(audio, "data"):
+                audio = io.BytesIO(audio.data)
+            else:
+                audio = io.BytesIO(audio)
+            f.write(audio.read())
+    audio_transcript: str = whisper_model.transcribe(f"./audio/{audio_name}")["text"]
+    text_splitter = CharacterTextSplitter(
+        separator="\n",
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    chunks = text_splitter.split_text(audio_transcript)
+    text_embeddings = text_embedding_model.encode(chunks)
+    for i, chunk in enumerate(chunks):
+        update_vectordb(index_path="audio_index.index", embedding=text_embeddings[i], text_content=chunk, audio_path=f"./audio/{audio_name}")
+    return audio_transcript
+
+
 def search_image_index_with_image(image_features, index: faiss.IndexFlatL2, clip_model: clip.model.CLIP, k: int = 3):
     with torch.no_grad():
         distances, indices = index.search(image_features.cpu().numpy(), k)
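
Because add_audio_to_index only needs a file-like object with read() (and optionally a name attribute), it also works outside the Streamlit widgets. A sketch under that assumption, with sample.wav as a hypothetical input file:

import whisper
from sentence_transformers import SentenceTransformer
from vectordb import add_audio_to_index

whisper_model = whisper.load_model("small")  # matches utils.load_whisper_model
text_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # matches utils.load_text_embedding_model

# The function copies the file to ./audio/, transcribes it, chunks the
# transcript, and writes one embedding per chunk to audio_index.index.
with open("sample.wav", "rb") as f:
    transcript = add_audio_to_index(f, whisper_model, text_embedding_model)
print(transcript)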