|
|
|
""" |
|
Base Reader and Document |
|
""" |
|
import os |
|
from dataclasses import dataclass |
|
from dataclasses_json import dataclass_json |
|
from typing import Any, Dict, List, Optional |
|
from glob import glob |
|
from build_index.parser import ParserFactory |
|
from langchain.docstore.document import Document as LCDocument |
|
|
|
|
|
@dataclass_json |
|
@dataclass |
|
class Document: |
|
text: str = None |
|
doc_id: Optional[str] = None |
|
embedding: Optional[List[float]] = None |
|
extra_info: Optional[Dict[str, Any]] = None |
|
|
|
def get_text(self): |
|
return self.text |
|
|
|
def get_doc_id(self): |
|
return self.doc_id |
|
|
|
def get_embedding(self): |
|
return self.embedding |
|
|
|
@property |
|
def extra_info_str(self) -> Optional[str]: |
|
"""Extra info string.""" |
|
if self.extra_info is None: |
|
return None |
|
|
|
return "\n".join([f"{k}: {str(v)}" for k, v in self.extra_info.items()]) |
|
|
|
def __post_init__(self): |
|
|
|
assert self.text is not None, 'Text Field can not be None' |
|
|
|
def to_langchain_format(self): |
|
"""Convert struct to LangChain document format.""" |
|
metadata = self.extra_info or {} |
|
return LCDocument(page_content=self.text, metadata=metadata) |
|
|
|
|
|
class FileReader(object): |
|
""" |
|
Load file from ./data_dir |
|
""" |
|
def __init__(self, data_dir=None, folder_name=None, input_files=None, has_meta=True): |
|
self.data_dir = data_dir |
|
self.has_meta = has_meta |
|
|
|
if input_files: |
|
self.input_files = input_files |
|
else: |
|
|
|
|
|
dir = os.path.join(data_dir, folder_name, '*') |
|
self.input_files = glob(dir) |
|
print(f'{len(self.input_files)} files in {dir}') |
|
print(self.input_files) |
|
|
|
def load_data(self, concatenate=False) -> List[Document]: |
|
data_list = [] |
|
metadata_list = [] |
|
for file in self.input_files: |
|
parser = ParserFactory['pdf'] |
|
if parser is None: |
|
raise ValueError(f"{file} format doesn't match any sufix supported") |
|
try: |
|
data, meta = parser.parse_file(file) |
|
except Exception as e: |
|
print(f'{file} parse failed. error = {e}') |
|
continue |
|
data_list.append(data) |
|
if self.has_meta: |
|
metadata_list.append(meta) |
|
|
|
if concatenate: |
|
return [Document("\n".join(data_list))] |
|
elif self.has_meta: |
|
return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] |
|
else: |
|
return [Document(d) for d in data_list] |
|
|