Spaces:

shimizukawa
/

python-no-senpai

Sleeping

App Files Files Community

python-no-senpai / loaders /rtdhtmlpage.py

shimizukawa

provide djangoproject special rule

4cf5bcf over 1 year ago

raw

history blame

2.7 kB

	from datetime import datetime
	from pathlib import Path
	from typing import Iterator

	from langchain.docstore.document import Document
	from langchain.document_loaders import ReadTheDocsLoader


	class RTDHtmlPageLoader(ReadTheDocsLoader):
	    """Load a directory tree of downloaded Read-the-Docs-style HTML pages.

	    Each regular file found under the input directory yields one
	    ``Document`` whose content is the text of the page's main container
	    and whose metadata carries the page title and a reconstructed URL.

	    Usage::

	        $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
	        $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
	    """

	    def __init__(self, inputfile: Path, *args, **kwargs):
	        """Initialise the loader with the Django docs content container.

	        Args:
	            inputfile: Directory holding the downloaded HTML files; passed
	                to the base loader as its first positional argument.
	            *args: Extra positional arguments forwarded to the base loader.
	            **kwargs: Extra keyword arguments forwarded to the base loader.
	                Any caller-supplied ``custom_html_tag`` is overwritten.
	        """
	        # docs.djangoproject.com special rule: the page body lives in
	        # <div id="docs-content">, so always look there first.
	        kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
	        super().__init__(inputfile, *args, **kwargs)

	    def _my_clean_data(self, data: str) -> tuple[str, str]:
	        """Extract the main text and the title from one HTML page.

	        Args:
	            data: Raw HTML source of the page.

	        Returns:
	            A ``(text, title)`` pair. ``text`` is the text of the first
	            matching content container with empty lines removed; ``title``
	            is the text of that container's first ``<h1>`` excluding its
	            direct ``<a>`` children. Both are ``""`` when no container is
	            found; ``title`` is ``""`` when the container has no ``<h1>``.
	        """
	        from bs4 import BeautifulSoup

	        soup = BeautifulSoup(data, **self.bs_kwargs)

	        # Default containers, lowest priority first.
	        html_tags = [
	            ("div", {"role": "main"}),
	            ("main", {"id": "main-content"}),
	        ]
	        if self.custom_html_tag is not None:
	            html_tags.append(self.custom_html_tag)

	        # Walk in reverse so the custom tag is tried before the defaults.
	        content = None
	        for tag, attrs in reversed(html_tags):
	            content = soup.find(tag, attrs)
	            if content is not None:
	                break

	        if content is None:
	            return "", ""

	        heading = content.find("h1")
	        if heading is None:
	            # A container without a heading is still worth indexing.
	            title = ""
	        else:
	            # Skip direct <a> children of the heading (presumably the
	            # permalink anchor added by the docs theme -- TODO confirm).
	            title = "".join(t.text for t in heading if t.name != "a")

	        # Trim empty lines.
	        text = "\n".join(
	            line for line in content.get_text().split("\n") if line
	        )
	        return text, title

	    def lazy_load(self) -> Iterator[Document]:
	        """Yield one ``Document`` per file found under ``self.file_path``.

	        Directories are skipped. The URL is rebuilt as ``https://`` plus
	        the file's path, so the path is expected to start with the site's
	        host name (as produced by ``wget -r``).
	        """
	        for p in self.file_path.rglob("*"):
	            if p.is_dir():
	                continue

	            # FIXME: would like to force utf-8 here, e.g.
	            #   open(p, encoding="utf-8", errors="ignore")
	            with open(p, encoding=self.encoding, errors=self.errors) as f:
	                text, title = self._my_clean_data(f.read())

	            if "docs.djangoproject.com" in p.parts and p.name == "index.html":
	                # The Django docs site answers 404 for ".../index.html",
	                # so point at the containing directory instead.
	                p = p.parent
	                url = f"https://{p.as_posix()}/"
	            else:
	                # as_posix() keeps forward slashes on every platform.
	                url = f"https://{p.as_posix()}"

	            metadata = {
	                "title": title,
	                "ctime": int(datetime.now().timestamp()),
	                "user": "rtd",
	                "type": "rtd",
	                "url": url,
	                "id": str(p),
	            }
	            yield Document(page_content=text, metadata=metadata)

	    def load(self) -> list[Document]:
	        """Eagerly load every document; see ``lazy_load``."""
	        return list(self.lazy_load())