python-no-senpai / loaders /rtdhtmlpage.py
shimizukawa's picture
provide djangoproject special rule
4cf5bcf
raw
history blame
2.7 kB
from datetime import datetime
from pathlib import Path
from typing import Iterator
from langchain.docstore.document import Document
from langchain.document_loaders import ReadTheDocsLoader
class RTDHtmlPageLoader(ReadTheDocsLoader):
    """Load a locally mirrored tree of documentation HTML pages.

    The loader is pointed at a directory of downloaded pages, for example::

        $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
        $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/

    Every file found below that directory is turned into one ``Document``
    whose metadata carries the page title and a URL rebuilt from the file's
    path.
    """

    def __init__(self, inputfile: Path, *args, **kwargs):
        """Create a loader rooted at *inputfile*.

        ``custom_html_tag`` is always forced to
        ``("div", {"id": "docs-content"})``; a value supplied by the caller
        through ``kwargs`` is replaced. All other arguments are forwarded
        unchanged to ``ReadTheDocsLoader``.
        """
        # NOTE(review): presumably the main content container of the Django
        # documentation pages shown in the class docstring -- confirm.
        kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
        super().__init__(inputfile, *args, **kwargs)

    def _my_clean_data(self, data: str) -> tuple[str, str]:
        """Extract the main text and the title from one HTML page.

        Args:
            data: Raw HTML of the page.

        Returns:
            A ``(text, title)`` pair. ``text`` is the text of the first
            matching content container with empty lines removed; ``title`` is
            the text of that container's first ``<h1>`` without its ``<a>``
            children. Both are ``""`` when no container matches, and ``title``
            is ``""`` when the container has no ``<h1>``.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(data, **self.bs_kwargs)

        # Default containers, in increasing priority.
        html_tags = [
            ("div", {"role": "main"}),
            ("main", {"id": "main-content"}),
        ]
        if self.custom_html_tag is not None:
            html_tags.append(self.custom_html_tag)

        # Walk the candidates back to front so the custom tag is tried first.
        content = None
        for tag, attrs in reversed(html_tags):
            content = soup.find(tag, attrs)
            if content is not None:
                break

        if content is None:
            return "", ""

        # find() returns None when the container has no <h1>; iterating over
        # that would raise TypeError and abort the whole load.
        heading = content.find("h1")
        if heading is None:
            title = ""
        else:
            # Skip <a> children of the heading so their text does not end up
            # in the title.
            title = "".join(
                child.text for child in heading if child.name != "a"
            )

        # Trim empty lines.
        text = "\n".join(
            line for line in content.get_text().split("\n") if line
        )
        return text, title

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per file found below ``self.file_path``.

        The URL stored in the metadata is ``https://`` followed by the file's
        path as produced by ``rglob``, so the path is expected to start with
        the site's host name (as it does for a relative ``wget -r`` mirror).
        ``ctime`` is the time of loading, not a timestamp taken from the file.
        """
        for p in self.file_path.rglob("*"):
            if p.is_dir():
                continue

            # FIXME: would like to force UTF-8 here, e.g.
            # open(p, encoding="utf-8", errors="ignore").
            with open(p, encoding=self.encoding, errors=self.errors) as f:
                text, title = self._my_clean_data(f.read())

            if "docs.djangoproject.com" in p.parts and p.name == "index.html":
                # The Django documentation site answers 404 for .../index.html,
                # so link to the containing directory instead.
                p = p.parent
                # as_posix() keeps forward slashes in the URL on every OS.
                url = f"https://{p.as_posix()}/"
            else:
                url = f"https://{p.as_posix()}"

            metadata = {
                "title": title,
                "ctime": int(datetime.now().timestamp()),
                "user": "rtd",
                "type": "rtd",
                "url": url,
                "id": str(p),
            }
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> list[Document]:
        """Load every document eagerly and return them as a list."""
        return list(self.lazy_load())