Spaces:
Sleeping
Sleeping
from datetime import datetime | |
from pathlib import Path | |
from typing import Iterator | |
from langchain.docstore.document import Document | |
from langchain.document_loaders import ReadTheDocsLoader | |
class RTDHtmlPageLoader(ReadTheDocsLoader):
    """Load a locally mirrored Read-the-Docs style HTML tree as documents.

    The loader is given the directory of a mirrored site and yields one
    ``Document`` per file found below it, carrying the extracted page text
    plus ``title`` / ``url`` / ``id`` metadata.

    Example usage:

    $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
    $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
    """

    def __init__(self, inputfile: Path, *args, **kwargs):
        """Initialise the loader for the mirrored directory *inputfile*.

        ``custom_html_tag`` is always set to ``("div", {"id": "docs-content"})``
        and overrides any value supplied by the caller; all other arguments
        are forwarded unchanged to ``ReadTheDocsLoader``.
        """
        kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
        super().__init__(inputfile, *args, **kwargs)

    def _my_clean_data(self, data: str) -> tuple[str, str]:
        """Extract the main text and the title from one HTML page.

        Args:
            data: Raw HTML source of the page.

        Returns:
            A ``(text, title)`` tuple.  ``text`` is the text of the first
            matching content container with empty lines removed; ``title``
            is the text of that container's first ``<h1>`` excluding any
            direct ``<a>`` children.  Both are ``""`` when no container
            matches, and ``title`` is ``""`` when the container has no
            ``<h1>``.
        """
        from bs4 import BeautifulSoup

        # NOTE(review): bs_kwargs / custom_html_tag are presumably set by
        # ReadTheDocsLoader.__init__ - confirm against the installed version.
        soup = BeautifulSoup(data, **self.bs_kwargs)

        # Default containers, lowest priority first.
        candidates = [
            ("div", {"role": "main"}),
            ("main", {"id": "main-content"}),
        ]
        if self.custom_html_tag is not None:
            candidates.append(self.custom_html_tag)

        # Walk the list backwards so the custom tag is tried first.
        content = None
        for tag, attrs in reversed(candidates):
            content = soup.find(tag, attrs)
            if content is not None:
                break

        if content is None:
            return "", ""

        # find() returns None when the container has no <h1>; iterating that
        # would raise TypeError, so fall back to an empty title instead.
        heading = content.find("h1")
        if heading is None:
            title = ""
        else:
            # Skip direct <a> children of the heading.
            title = "".join(
                child.text for child in heading if child.name != "a"
            )

        # Drop empty lines from the extracted text.
        text = "\n".join(
            line for line in content.get_text().split("\n") if line
        )
        return text, title

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` for every file below ``self.file_path``.

        Directories are skipped.  Each document's metadata holds the page
        ``title``, the current time as ``ctime`` (integer epoch seconds),
        the fixed ``user`` / ``type`` value ``"rtd"``, the reconstructed
        ``url`` and the local path as ``id``.
        """
        for p in self.file_path.rglob("*"):
            if p.is_dir():
                continue

            # FIXME: would like to force utf-8 here
            # (i.e. encoding='utf-8', errors='ignore').
            with open(p, encoding=self.encoding, errors=self.errors) as f:
                text, title = self._my_clean_data(f.read())

            if "docs.djangoproject.com" in p.parts and p.name == "index.html":
                # In the Django docs, requesting index.html directly returns
                # a 404, so point at the containing directory instead.
                p = p.parent
                url = f"https://{p.as_posix()}/"
            else:
                # as_posix() keeps forward slashes in the URL on every
                # platform; it equals str(p) on POSIX systems.
                url = f"https://{p.as_posix()}"

            metadata = {
                "title": title,
                "ctime": int(datetime.now().timestamp()),
                "user": "rtd",
                "type": "rtd",
                "url": url,
                "id": str(p),
            }
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> list[Document]:
        """Eagerly load every document and return them as a list."""
        return list(self.lazy_load())