vector_store_api / document_processor.py
JairoDanielMT's picture
Upload 7 files
ea83a52 verified
import os
from langchain_community.document_loaders import (
PyMuPDFLoader,
TextLoader,
Docx2txtLoader,
DirectoryLoader,
)
class DocumentProcessor:
    """Load the supported documents found in a single directory.

    Files are selected by extension (``.pdf``, ``.txt``, ``.docx``, ``.doc``)
    and each extension is handed to a ``DirectoryLoader`` configured with the
    matching loader class.
    """

    def __init__(self, path: str):
        """Store the directory to scan.

        Args:
            path: Directory whose files ``files_to_texts`` will load.
        """
        self.path = path

    def files_to_texts(self) -> list:
        """Load every supported file in ``self.path``.

        The directory is listed once; a ``DirectoryLoader`` is only created
        for an extension when at least one entry name ends with it, so
        extensions that are absent cost nothing.

        Returns:
            A list with the results of each loader's ``load()`` call,
            concatenated in configuration order (pdf, txt, docx, doc).
            An empty list when the directory has no entries or none of them
            carries a supported extension.

        Raises:
            OSError: Propagated from ``os.listdir`` when ``self.path`` cannot
                be listed (e.g. ``FileNotFoundError`` for a missing path).
        """
        # Single directory scan, reused for every extension check below.
        filenames = os.listdir(self.path)
        if not filenames:
            return []

        # glob pattern -> (loader class, keyword arguments for that loader).
        # Every entry has the same shape so it can be unpacked directly.
        loaders_config = {
            "*.pdf": (PyMuPDFLoader, None),
            "*.txt": (TextLoader, {"encoding": "utf-8"}),
            "*.docx": (Docx2txtLoader, None),
            # NOTE(review): Docx2txtLoader presumably only parses .docx
            # archives; legacy binary .doc files may fail here -- confirm.
            "*.doc": (Docx2txtLoader, None),
        }

        documents = []
        for glob, (loader_cls, loader_kwargs) in loaders_config.items():
            # "*.pdf" -> ".pdf"; the match is case-sensitive, like the glob.
            suffix = glob[1:]
            if not any(name.endswith(suffix) for name in filenames):
                continue
            loader = DirectoryLoader(
                path=self.path,
                glob=glob,
                loader_cls=loader_cls,
                loader_kwargs=loader_kwargs,
            )
            documents.extend(loader.load())
        return documents