commit to add lecture pdfs in context
- code/modules/data_loader.py +34 -2
- code/modules/helpers.py +15 -6
- code/modules/vector_db.py +11 -0
- requirements.txt +1 -0
code/modules/data_loader.py
CHANGED
@@ -48,6 +48,27 @@ class DataLoader:
         self.splitter = None
         logger.info("InfoLoader instance created")
 
+    def extract_text_from_pdf(self, pdf_path):
+        text = ""
+        with open(pdf_path, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            num_pages = len(reader.pages)
+            for page_num in range(num_pages):
+                page = reader.pages[page_num]
+                text += page.extract_text()
+        return text
+
+    def download_pdf_from_url(self, pdf_url):
+        response = requests.get(pdf_url)
+        if response.status_code == 200:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                temp_file.write(response.content)
+                temp_file_path = temp_file.name
+            return temp_file_path
+        else:
+            print("Failed to download PDF from URL:", pdf_url)
+            return None
+
     def get_chunks(self, uploaded_files, weblinks):
         # Main list of all documents
         self.document_chunks_full = []
@@ -78,6 +99,13 @@ class DataLoader:
             logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}")
             return document_chunks
 
+        def get_pdf_from_url(pdf_url: str):
+            temp_pdf_path = self.download_pdf_from_url(pdf_url)
+            if temp_pdf_path:
+                title, document_chunks = get_pdf(temp_pdf_path, pdf_url)
+                os.remove(temp_pdf_path)
+            return title, document_chunks
+
         def get_pdf(temp_file_path: str, title: str):
             """
             Function to process PDF files
@@ -201,7 +229,10 @@ class DataLoader:
 
             # Handle different file types
            if file_type == "pdf":
-                title, document_chunks = get_pdf(file_path, file_name)
+                try:
+                    title, document_chunks = get_pdf(file_path, file_name)
+                except:
+                    title, document_chunks = get_pdf_from_url(file_path)
             elif file_type == "txt":
                 title, document_chunks = get_txt(file_path, file_name)
             elif file_type == "docx":
@@ -215,7 +246,7 @@ class DataLoader:
             if self.config["splitter_options"]["remove_chunks"]:
                 document_chunks = remove_chunks(document_chunks)
 
-            logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
+            logger.info(f"\t\tExtracted no. of chunks: {len(document_chunks)} from {file_name}")
             self.document_names.append(title)
             self.document_chunks_full.extend(document_chunks)
 
@@ -243,6 +274,7 @@ class DataLoader:
                 self.document_chunks_full.extend(document_chunks)
             except:
                 logger.info(f"\t\tError splitting link {link_index+1} : {link}")
+                exit()
 
         logger.info(
             f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
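For reference, a minimal standalone sketch of the flow the two new DataLoader helpers implement: download a PDF to a temporary file with requests, then read its text with PyPDF2. The fetch_pdf_text wrapper and the URL are illustrative, not part of the commit.

import os
import tempfile
from typing import Optional

import requests
from PyPDF2 import PdfReader


def fetch_pdf_text(pdf_url: str) -> Optional[str]:
    # Download the PDF to a temporary file (mirrors download_pdf_from_url)
    response = requests.get(pdf_url, timeout=30)
    if response.status_code != 200:
        print("Failed to download PDF from URL:", pdf_url)
        return None
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(response.content)
        temp_file_path = temp_file.name
    try:
        # Read the text page by page (mirrors extract_text_from_pdf)
        reader = PdfReader(temp_file_path)
        return "".join(page.extract_text() or "" for page in reader.pages)
    finally:
        os.remove(temp_file_path)


if __name__ == "__main__":
    text = fetch_pdf_text("https://example.edu/course/lecture01.pdf")  # placeholder URL
    if text is not None:
        print(text[:300])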
code/modules/helpers.py
CHANGED
@@ -36,6 +36,10 @@ class WebpageCrawler:
         soup = BeautifulSoup(html_data, "html.parser")
         list_links = []
         for link in soup.find_all("a", href=True):
+
+            # clean the link
+            # remove empty spaces
+            link["href"] = link["href"].strip()
             # Append to list if new link contains original link
             if str(link["href"]).startswith((str(website_link))):
                 list_links.append(link["href"])
@@ -56,14 +60,19 @@ class WebpageCrawler:
 
     def get_subpage_links(self, l, base_url):
         for link in tqdm(l):
-            # If not crawled through this page start crawling and get links
-            if l[link] == "Not-checked":
-                dict_links_subpages = self.get_links(link, base_url)
-                # Change the dictionary value of the link to "Checked"
+            print('checking link:', link)
+            if not link.endswith("/"):
                 l[link] = "Checked"
-            else:
-                # Create an empty dictionary in case every link is checked
                 dict_links_subpages = {}
+            else:
+                # If not crawled through this page start crawling and get links
+                if l[link] == "Not-checked":
+                    dict_links_subpages = self.get_links(link, base_url)
+                    # Change the dictionary value of the link to "Checked"
+                    l[link] = "Checked"
+                else:
+                    # Create an empty dictionary in case every link is checked
+                    dict_links_subpages = {}
             # Add new dictionary to old dictionary
             l = {**dict_links_subpages, **l}
         return l
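In short, the crawler now marks any non-directory link (anything not ending in "/", such as the lecture PDFs) as "Checked" without expanding it, and keeps the old "Not-checked"/"Checked" bookkeeping only for directory-style links. Below is a toy version of that loop, with a hypothetical get_links stand-in and placeholder URLs.

def expand_links(links, get_links):
    # Toy version of get_subpage_links' bookkeeping after this change
    for link in list(links):
        if not link.endswith("/"):
            # File-like link (e.g. a PDF): mark it checked, do not crawl it
            links[link] = "Checked"
            subpages = {}
        elif links[link] == "Not-checked":
            # Directory-style link not crawled yet: fetch its sub-links
            subpages = get_links(link)
            links[link] = "Checked"
        else:
            subpages = {}
        links = {**subpages, **links}
    return links


fake_site = {  # hypothetical site map standing in for WebpageCrawler.get_links
    "https://example.edu/course/": {
        "https://example.edu/course/lectures/": "Not-checked",
        "https://example.edu/course/lecture01.pdf": "Not-checked",
    },
}
print(expand_links({"https://example.edu/course/": "Not-checked"},
                   lambda url: dict(fake_site.get(url, {}))))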
code/modules/vector_db.py
CHANGED
@@ -60,6 +60,14 @@ class VectorDB:
             urls = all_urls
         return files, urls
 
+    def clean_url_list(self, urls):
+        # get lecture pdf links
+        lecture_pdfs = [link for link in urls if link.endswith(".pdf")]
+        lecture_pdfs = [link for link in lecture_pdfs if "lecture" in link.lower()]
+        urls = [link for link in urls if link.endswith("/")]  # only keep links that end with a '/'. Extract files separately
+
+        return urls, lecture_pdfs
+
     def create_embedding_model(self):
         self.logger.info("Creating embedding function")
         self.embedding_model_loader = EmbeddingModelLoader(self.config)
@@ -79,6 +87,9 @@ class VectorDB:
         data_loader = DataLoader(self.config)
         self.logger.info("Loading data")
         files, urls = self.load_files()
+        urls, lecture_pdfs = self.clean_url_list(urls)
+        files += lecture_pdfs
+        files.remove('storage/data/urls.txt')
         document_chunks, document_names = data_loader.get_chunks(files, urls)
         self.logger.info("Completed loading data")
 
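The new clean_url_list step partitions the crawled URLs before loading: links ending in ".pdf" whose address contains "lecture" are routed to the file loader, and only directory-style links (ending in "/") are kept as webpages. A small standalone illustration with placeholder URLs:

def clean_url_list(urls):
    # Same filtering as the method added above, shown standalone
    lecture_pdfs = [link for link in urls if link.endswith(".pdf") and "lecture" in link.lower()]
    page_urls = [link for link in urls if link.endswith("/")]
    return page_urls, lecture_pdfs


urls = [
    "https://example.edu/course/",
    "https://example.edu/course/lectures/lecture01.pdf",
    "https://example.edu/course/syllabus.pdf",
]
pages, pdfs = clean_url_list(urls)
print(pages)  # ['https://example.edu/course/']
print(pdfs)   # ['https://example.edu/course/lectures/lecture01.pdf']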
requirements.txt
CHANGED
@@ -16,3 +16,4 @@ beautifulsoup4==4.12.2
 fake-useragent==1.4.0
 git+https://github.com/huggingface/accelerate.git
 llama-cpp-python
+PyPDF2==3.0.1