XThomasBU
committed on
Commit
·
bbd24f7
1
Parent(s):
36cd3f9
minor updates
Browse files
code/modules/config/config.yml
CHANGED
@@ -37,7 +37,7 @@ llm_params:
|
|
37 |
temperature: 0.7 # float
|
38 |
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
39 |
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
40 |
-
pdf_reader: '
|
41 |
stream: False # bool
|
42 |
|
43 |
chat_logging:
|
|
|
37 |
temperature: 0.7 # float
|
38 |
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
39 |
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
40 |
+
pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
|
41 |
stream: False # bool
|
42 |
|
43 |
chat_logging:
|
code/modules/dataloader/data_loader.py
CHANGED
@@ -63,7 +63,7 @@ class HTMLReader:
|
|
63 |
href = href.replace("http", "https")
|
64 |
|
65 |
absolute_url = urljoin(base_url, href)
|
66 |
-
link[
|
67 |
|
68 |
resp = requests.head(absolute_url)
|
69 |
if resp.status_code != 200:
|
@@ -84,6 +84,7 @@ class HTMLReader:
|
|
84 |
else:
|
85 |
return None
|
86 |
|
|
|
87 |
class FileReader:
|
88 |
def __init__(self, logger, kind):
|
89 |
self.logger = logger
|
@@ -95,8 +96,9 @@ class FileReader:
|
|
95 |
else:
|
96 |
self.pdf_reader = PDFReader()
|
97 |
self.web_reader = HTMLReader()
|
98 |
-
self.logger.info(
|
99 |
-
|
|
|
100 |
|
101 |
def extract_text_from_pdf(self, pdf_path):
|
102 |
text = ""
|
@@ -374,7 +376,9 @@ class ChunkProcessor:
|
|
374 |
|
375 |
class DataLoader:
|
376 |
def __init__(self, config, logger=None):
|
377 |
-
self.file_reader = FileReader(
|
|
|
|
|
378 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
379 |
|
380 |
def get_chunks(self, uploaded_files, weblinks):
|
@@ -392,19 +396,22 @@ if __name__ == "__main__":
|
|
392 |
with open("../code/modules/config/config.yml", "r") as f:
|
393 |
config = yaml.safe_load(f)
|
394 |
|
395 |
-
STORAGE_DIR = os.path.join(BASE_DIR, config[
|
396 |
uploaded_files = [
|
397 |
-
os.path.join(STORAGE_DIR, file)
|
|
|
|
|
398 |
]
|
399 |
|
400 |
data_loader = DataLoader(config, logger=logger)
|
401 |
document_chunks, document_names, documents, document_metadata = (
|
402 |
data_loader.get_chunks(
|
403 |
-
[
|
|
|
|
|
404 |
[],
|
405 |
)
|
406 |
)
|
407 |
|
408 |
print(document_names[:5])
|
409 |
print(len(document_chunks))
|
410 |
-
|
|
|
63 |
href = href.replace("http", "https")
|
64 |
|
65 |
absolute_url = urljoin(base_url, href)
|
66 |
+
link["href"] = absolute_url
|
67 |
|
68 |
resp = requests.head(absolute_url)
|
69 |
if resp.status_code != 200:
|
|
|
84 |
else:
|
85 |
return None
|
86 |
|
87 |
+
|
88 |
class FileReader:
|
89 |
def __init__(self, logger, kind):
|
90 |
self.logger = logger
|
|
|
96 |
else:
|
97 |
self.pdf_reader = PDFReader()
|
98 |
self.web_reader = HTMLReader()
|
99 |
+
self.logger.info(
|
100 |
+
f"Initialized FileReader with {kind} PDF reader and HTML reader"
|
101 |
+
)
|
102 |
|
103 |
def extract_text_from_pdf(self, pdf_path):
|
104 |
text = ""
|
|
|
376 |
|
377 |
class DataLoader:
|
378 |
def __init__(self, config, logger=None):
|
379 |
+
self.file_reader = FileReader(
|
380 |
+
logger=logger, kind=config["llm_params"]["pdf_reader"]
|
381 |
+
)
|
382 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
383 |
|
384 |
def get_chunks(self, uploaded_files, weblinks):
|
|
|
396 |
with open("../code/modules/config/config.yml", "r") as f:
|
397 |
config = yaml.safe_load(f)
|
398 |
|
399 |
+
STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
|
400 |
uploaded_files = [
|
401 |
+
os.path.join(STORAGE_DIR, file)
|
402 |
+
for file in os.listdir(STORAGE_DIR)
|
403 |
+
if file != "urls.txt"
|
404 |
]
|
405 |
|
406 |
data_loader = DataLoader(config, logger=logger)
|
407 |
document_chunks, document_names, documents, document_metadata = (
|
408 |
data_loader.get_chunks(
|
409 |
+
[
|
410 |
+
"https://dl4ds.github.io/sp2024/static_files/discussion_slides/00_discussion.pdf"
|
411 |
+
],
|
412 |
[],
|
413 |
)
|
414 |
)
|
415 |
|
416 |
print(document_names[:5])
|
417 |
print(len(document_chunks))
|
|