Farid Karimli committed on
Commit
39c29a9
·
1 Parent(s): 0339679

PyMUPDFReader fix and cleanup

Browse files
code/modules/dataloader/data_loader.py CHANGED
@@ -105,11 +105,7 @@ class FileReader:
105
  return text
106
 
107
  def read_pdf(self, temp_file_path: str):
108
- if self.kind == "llama":
109
- documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
110
- else:
111
- loader = self.pdf_reader.get_loader(temp_file_path)
112
- documents = self.pdf_reader.get_documents(loader)
113
  return documents
114
 
115
  def read_txt(self, temp_file_path: str):
@@ -289,7 +285,6 @@ class ChunkProcessor:
289
  )
290
  self.document_chunks_full.extend(document_chunks)
291
 
292
- print(f"Processed {file_path}. File_data: {file_data}")
293
  self.document_data[file_path] = file_data
294
  self.document_metadata[file_path] = file_metadata
295
 
 
105
  return text
106
 
107
  def read_pdf(self, temp_file_path: str):
108
+ documents = self.pdf_reader.parse(temp_file_path)
 
 
 
 
109
  return documents
110
 
111
  def read_txt(self, temp_file_path: str):
 
285
  )
286
  self.document_chunks_full.extend(document_chunks)
287
 
 
288
  self.document_data[file_path] = file_data
289
  self.document_metadata[file_path] = file_metadata
290