Farid Karimli
committed on
Commit
·
39c29a9
1
Parent(s):
0339679
PyMUPDFReader fix and cleanup
Browse files
code/modules/dataloader/data_loader.py
CHANGED
@@ -105,11 +105,7 @@ class FileReader:
|
|
105 |
return text
|
106 |
|
107 |
def read_pdf(self, temp_file_path: str):
|
108 |
-
|
109 |
-
documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
|
110 |
-
else:
|
111 |
-
loader = self.pdf_reader.get_loader(temp_file_path)
|
112 |
-
documents = self.pdf_reader.get_documents(loader)
|
113 |
return documents
|
114 |
|
115 |
def read_txt(self, temp_file_path: str):
|
@@ -289,7 +285,6 @@ class ChunkProcessor:
|
|
289 |
)
|
290 |
self.document_chunks_full.extend(document_chunks)
|
291 |
|
292 |
-
print(f"Processed {file_path}. File_data: {file_data}")
|
293 |
self.document_data[file_path] = file_data
|
294 |
self.document_metadata[file_path] = file_metadata
|
295 |
|
|
|
105 |
return text
|
106 |
|
107 |
def read_pdf(self, temp_file_path: str):
|
108 |
+
documents = self.pdf_reader.parse(temp_file_path)
|
|
|
|
|
|
|
|
|
109 |
return documents
|
110 |
|
111 |
def read_txt(self, temp_file_path: str):
|
|
|
285 |
)
|
286 |
self.document_chunks_full.extend(document_chunks)
|
287 |
|
|
|
288 |
self.document_data[file_path] = file_data
|
289 |
self.document_metadata[file_path] = file_metadata
|
290 |
|