import uuid
import logging
from typing import List
from langchain_community.document_loaders import SeleniumURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Configure a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Load web pages and split them into chunks for downstream indexing
def load_and_split_docs(urls: List[str]):
    """Load pages from `urls` with Selenium and split them into token-sized chunks."""
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]

    logger.info("Loading documents with SeleniumURLLoader...")
    loader = SeleniumURLLoader(urls=urls)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,         # Maximum number of tokens per chunk (value chosen arbitrarily)
        chunk_overlap=50,       # Number of tokens shared between consecutive chunks
        add_start_index=True,   # If `True`, include each chunk's start index in its metadata
        strip_whitespace=True,  # If `True`, strip whitespace from the start and end of every chunk
        separators=MARKDOWN_SEPARATORS,
    )

    logger.info("Splitting documents...")
    docs_split = text_splitter.split_documents(docs)

    # Attach short random identifiers to each chunk for later reference
    for doc in docs_split:
        doc.metadata['id'] = str(uuid.uuid4())[:4]
        doc.metadata['chunk-id'] = str(uuid.uuid4())[-4:]
    return docs_split
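

# Minimal usage sketch (the URLs below are hypothetical placeholders, not from the
# original code). Running it requires a working Selenium/webdriver environment for
# SeleniumURLLoader and the tiktoken package for the token-based splitter.
if __name__ == "__main__":
    example_urls = [
        "https://example.com/docs/page-1",
        "https://example.com/docs/page-2",
    ]
    chunks = load_and_split_docs(example_urls)
    logger.info("Produced %d chunks", len(chunks))
    if chunks:
        logger.info("First chunk metadata: %s", chunks[0].metadata)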