# -*-coding:utf-8 -*-
"""
    Base Reader and Document
"""
import os
from dataclasses import dataclass
from glob import glob
from typing import Any, Dict, List, Optional

from dataclasses_json import dataclass_json
from langchain.docstore.document import Document as LCDocument

from build_index.parser import ParserFactory


@dataclass_json
@dataclass
class Document:
    """A piece of text with an optional id, embedding and metadata dict.

    ``text`` is mandatory: constructing a ``Document`` whose ``text`` is
    ``None`` raises ``AssertionError`` in ``__post_init__``.
    """

    # Defaults to None so that a missing value is reported by __post_init__.
    # NOTE(review): the annotation is kept as ``str`` because dataclass_json
    # may use it when decoding -- confirm before widening to Optional[str].
    text: str = None
    doc_id: Optional[str] = None
    embedding: Optional[List[float]] = None
    extra_info: Optional[Dict[str, Any]] = None

    def get_text(self):
        """Return the document text."""
        return self.text

    def get_doc_id(self):
        """Return the document id (may be None)."""
        return self.doc_id

    def get_embedding(self):
        """Return the embedding vector (may be None)."""
        return self.embedding

    @property
    def extra_info_str(self) -> Optional[str]:
        """Extra info rendered as one ``key: value`` line per entry.

        Returns None when ``extra_info`` is None.
        """
        if self.extra_info is None:
            return None
        return "\n".join(f"{k}: {v!s}" for k, v in self.extra_info.items())

    def __post_init__(self):
        # Field validation. Raised explicitly instead of using `assert` so the
        # check is not stripped under `python -O`; the exception type and
        # message are the same as the assert produced.
        if self.text is None:
            raise AssertionError('Text Field can not be None')

    def to_langchain_format(self):
        """Convert struct to LangChain document format."""
        # An absent (or empty) extra_info becomes an empty metadata dict.
        metadata = self.extra_info or {}
        return LCDocument(page_content=self.text, metadata=metadata)


class FileReader(object):
    """
    Load file from ./data_dir
    """

    def __init__(self, data_dir=None, folder_name=None, input_files=None, has_meta=True):
        """Collect the list of files to read.

        Args:
            data_dir: Base directory; used only when ``input_files`` is empty.
            folder_name: Sub-folder of ``data_dir`` whose direct children are
                globbed; used only when ``input_files`` is empty.
            input_files: Explicit list of file paths. When truthy it is used
                as-is and no globbing happens.
            has_meta: Whether ``load_data`` attaches the parser's metadata to
                each returned ``Document``.

        Raises:
            ValueError: If ``input_files`` is empty and ``data_dir`` or
                ``folder_name`` is None.
        """
        self.data_dir = data_dir
        self.has_meta = has_meta
        if input_files:
            self.input_files = input_files
        else:
            # Get all files in data_dir/folder_name.
            # TODO: recursive sub-directories under data are not supported yet.
            if data_dir is None or folder_name is None:
                raise ValueError(
                    'data_dir and folder_name are required when input_files is not given'
                )
            pattern = os.path.join(data_dir, folder_name, '*')
            self.input_files = glob(pattern)
            print(f'{len(self.input_files)} files in {pattern}')
            print(self.input_files)

    def load_data(self, concatenate=False) -> List[Document]:
        """Parse every input file and wrap the results in ``Document`` objects.

        Files whose parsing raises are reported on stdout and skipped.

        Args:
            concatenate: When True, return a single ``Document`` whose text is
                all parsed texts joined with newlines (metadata is dropped).

        Returns:
            A list of ``Document``: one joined document when ``concatenate``
            is True, otherwise one per successfully parsed file, carrying the
            parser's metadata as ``extra_info`` when ``self.has_meta`` is set.

        Raises:
            ValueError: If the parser lookup yields None.
        """
        data_list = []
        metadata_list = []
        for file in self.input_files:
            # NOTE(review): the parser key is hard-coded to 'pdf' regardless of
            # the file's extension, although the error below talks about
            # suffixes -- confirm whether a per-suffix lookup was intended.
            parser = ParserFactory['pdf']
            if parser is None:
                raise ValueError(f"{file} format doesn't match any sufix supported")
            try:
                data, meta = parser.parse_file(file)
            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f'{file} parse failed. \nerror = {e}')
                continue
            data_list.append(data)
            if self.has_meta:
                metadata_list.append(meta)

        if concatenate:
            return [Document("\n".join(data_list))]
        if self.has_meta:
            # data_list and metadata_list are appended together, so they align.
            return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
        return [Document(d) for d in data_list]