I-AdityaGoyal committed
Commit 1272fb9 · verified · Parent: 2a31972

Upload 8 files

Files changed (8)
  1. app.py +93 -0
  2. faiss_indexing.py +20 -0
  3. pdf_generator.py +23 -0
  4. pdf_processing.py +14 -0
  5. requirements.txt +12 -0
  6. text_to_speech.py +6 -0
  7. utils.py +26 -0
  8. youtube_processing.py +16 -0
app.py ADDED
@@ -0,0 +1,93 @@
+ import streamlit as st
+ import os
+
+ from pdf_processing import extract_text_from_pdf
+ from youtube_processing import extract_text_from_youtube
+ from faiss_indexing import get_embeddings, create_faiss_index, query_faiss_index
+ from utils import load_environment_variables, query_huggingface_api, chunk_text
+ from pdf_generator import generate_pdf
+ from text_to_speech import speak_text
+ from sentence_transformers import SentenceTransformer
+
+ # Load environment variables
+ hf_token = load_environment_variables()
+ if not hf_token:
+     st.error("Hugging Face API token is missing. Please add it to your .env file.")
+     st.stop()
+
+ # Define the Hugging Face API endpoint
+ API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
+ headers = {
+     "Authorization": f"Bearer {hf_token}"
+ }
+
+ # Initialize the sentence transformer model
+ model_name = 'all-MiniLM-L6-v2'
+ model = SentenceTransformer(model_name)
+
+ # Streamlit UI
+ st.title("NoteBot - Notes Retrieval System")
+ st.write("By - Aditya Goyal")
+ st.write("Upload PDFs or provide YouTube links to ask questions about their content.")
+
+ uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
+ youtube_url = st.text_input("Enter YouTube video URL:")
+
+ all_chunks = []
+
+ # Process PDF files
+ if uploaded_files:
+     for uploaded_file in uploaded_files:
+         pdf_path = os.path.join("temp", uploaded_file.name)
+         if not os.path.exists("temp"):
+             os.makedirs("temp")
+         with open(pdf_path, "wb") as f:
+             f.write(uploaded_file.getbuffer())
+         text = extract_text_from_pdf(pdf_path)
+         chunks = chunk_text(text)
+         all_chunks.extend(chunks)
+
+ # Process YouTube video
+ if youtube_url:
+     yt_text = extract_text_from_youtube(youtube_url)
+     yt_chunks = chunk_text(yt_text)
+     all_chunks.extend(yt_chunks)
+
+ if all_chunks:
+     embeddings = get_embeddings(all_chunks, model)
+     faiss_index = create_faiss_index(embeddings)
+
+     query_text = st.text_input("Enter your query:")
+     if query_text:
+         query_embedding = get_embeddings([query_text], model)
+         distances, indices = query_faiss_index(faiss_index, query_embedding)
+         similar_chunks = [all_chunks[i] for i in indices[0]]
+
+         # Ensure we only use a manageable number of chunks
+         num_chunks_to_use = min(5, len(similar_chunks))
+         selected_chunks = similar_chunks[:num_chunks_to_use]
+
+         template = """Based on the following chunks: {similar_chunks}
+ Question: {question}
+ Answer:"""
+
+         prompt_text = template.format(similar_chunks="\n".join(selected_chunks), question=query_text)
+
+         # Generate response from Hugging Face API
+         response = query_huggingface_api(prompt_text, API_URL, headers)
+
+         if "Error" not in response:
+             st.write("**Answer:**", response)
+
+             # Add button to download response as PDF
+             if st.button("Download Response as PDF"):
+                 pdf_path = os.path.join("temp", "response.pdf")
+                 generate_pdf(response, pdf_path)
+                 with open(pdf_path, "rb") as f:
+                     st.download_button(label="Download PDF", data=f, file_name="response.pdf")
+
+             # Add button to speak the response text
+             if st.button("Speak Response"):
+                 speak_text(response)
+         else:
+             st.error(response)
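
A minimal way to run the app locally (not part of the commit), assuming the packages from requirements.txt are installed and HF_TOKEN is set in a .env file next to app.py:

    streamlit run app.py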
faiss_indexing.py ADDED
@@ -0,0 +1,20 @@
+ import faiss
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+ def get_embeddings(texts, model):
+     embeddings = model.encode(texts, convert_to_tensor=True)
+     return embeddings
+
+ def create_faiss_index(embeddings):
+     embeddings_np = embeddings.cpu().numpy()  # Move to CPU and convert to numpy
+     dim = embeddings_np.shape[1]
+     index = faiss.IndexFlatL2(dim)
+     faiss_index = faiss.IndexIDMap(index)
+     faiss_index.add_with_ids(embeddings_np, np.arange(len(embeddings_np)))
+     return faiss_index
+
+ def query_faiss_index(index, query_embedding, k=5):
+     query_embedding_np = query_embedding.cpu().numpy()  # Move to CPU and convert to numpy
+     distances, indices = index.search(query_embedding_np, k)
+     return distances, indices
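
A small usage sketch for these helpers (not part of the commit); the chunk texts and query are made-up examples:

    from sentence_transformers import SentenceTransformer
    from faiss_indexing import get_embeddings, create_faiss_index, query_faiss_index

    model = SentenceTransformer('all-MiniLM-L6-v2')
    chunks = ["FAISS stores dense vectors.", "Streamlit renders the UI."]  # example corpus
    index = create_faiss_index(get_embeddings(chunks, model))
    # k should not exceed the number of indexed chunks, otherwise FAISS pads results with -1
    distances, indices = query_faiss_index(index, get_embeddings(["What stores vectors?"], model), k=2)
    print(chunks[indices[0][0]])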
pdf_generator.py ADDED
@@ -0,0 +1,23 @@
+ from fpdf import FPDF
+
+ class PDF(FPDF):
+     def header(self):
+         self.set_font('Arial', 'B', 12)
+         self.cell(0, 10, 'NoteBot Response', 0, 1, 'C')
+
+     def chapter_title(self, title):
+         self.set_font('Arial', 'B', 12)
+         self.cell(0, 10, title, 0, 1, 'L')
+         self.ln(10)
+
+     def chapter_body(self, body):
+         self.set_font('Arial', '', 12)
+         self.multi_cell(0, 10, body)
+         self.ln()
+
+ def generate_pdf(text, path):
+     pdf = PDF()
+     pdf.add_page()
+     pdf.chapter_title('Response:')
+     pdf.chapter_body(text)
+     pdf.output(path)
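
For reference, a hedged example of calling generate_pdf (not part of the commit); classic fpdf with the built-in Arial font handles Latin-1 text only, so responses containing other characters may fail to render:

    from pdf_generator import generate_pdf
    generate_pdf("This is a sample NoteBot answer.", "response.pdf")  # sample text and path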
pdf_processing.py ADDED
@@ -0,0 +1,14 @@
+ import fitz  # PyMuPDF
+
+ def extract_text_from_pdf(pdf_path):
+     try:
+         pdf_document = fitz.open(pdf_path)
+         text = ""
+         for page_num in range(len(pdf_document)):
+             page = pdf_document.load_page(page_num)
+             text += page.get_text()
+         pdf_document.close()
+         return text
+     except Exception as e:
+         print(f"Error extracting text from PDF: {e}")
+         return ""
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ streamlit
+ PyMuPDF
+ numpy
+ faiss-cpu
+ sentence-transformers
+ python-dotenv
+ requests
+ langchain
+ youtube-transcript-api
+ speechrecognition
+ fpdf
+ pyttsx3
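
The unpinned dependencies above can be installed in one step; a fresh virtual environment is assumed but not required:

    pip install -r requirements.txt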
text_to_speech.py ADDED
@@ -0,0 +1,6 @@
+ import pyttsx3
+
+ def speak_text(text):
+     engine = pyttsx3.init()
+     engine.say(text)
+     engine.runAndWait()
utils.py ADDED
@@ -0,0 +1,26 @@
+ import os
+ from dotenv import load_dotenv
+
+ def load_environment_variables():
+     load_dotenv()
+     hf_token = os.getenv("HF_TOKEN")
+     return hf_token
+
+ def query_huggingface_api(prompt, api_url, headers):
+     import requests
+     response = requests.post(api_url, headers=headers, json={"inputs": prompt})
+     if response.status_code == 200:
+         generated_text = response.json()[0]['generated_text']
+         # Extract only the final answer
+         answer_start = generated_text.find("Answer: ")
+         if answer_start != -1:
+             answer = generated_text[answer_start + len("Answer: "):].strip()
+         else:
+             answer = generated_text
+         return answer
+     else:
+         return f"Error {response.status_code}: {response.text}"
+
+ def chunk_text(text, chunk_size=1000):
+     chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
+     return chunks
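
A sketch of how these utilities fit together outside Streamlit (not part of the commit); the endpoint and header mirror app.py, and the question text is a made-up example:

    from utils import load_environment_variables, chunk_text, query_huggingface_api

    hf_token = load_environment_variables()  # reads HF_TOKEN from .env
    headers = {"Authorization": f"Bearer {hf_token}"}
    api_url = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"

    chunks = chunk_text("some long document text " * 100)  # fixed 1000-character slices
    answer = query_huggingface_api("Question: What is NoteBot?\nAnswer:", api_url, headers)
    print(answer)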
youtube_processing.py ADDED
@@ -0,0 +1,16 @@
+ from youtube_transcript_api import YouTubeTranscriptApi
+ import re
+
+ def extract_text_from_youtube(video_url):
+     video_id = re.search(r"(?<=v=)[^&#]+", video_url)
+     if not video_id:
+         return ""
+
+     video_id = video_id.group(0)
+     try:
+         transcript = YouTubeTranscriptApi.get_transcript(video_id)
+         text = " ".join([item['text'] for item in transcript])
+         return text
+     except Exception as e:
+         print(f"Error fetching transcript: {e}")
+         return ""