"""Load web pages with Selenium and split them into markdown-aware chunks."""
import logging
import uuid
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import SeleniumURLLoader
# add logger | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
# get document | |
def load_and_split_docs(urls: List[str]): | |
MARKDOWN_SEPARATORS = [ | |
"\n#{1,6} ", | |
"```\n", | |
"\n\\*\\*\\*+\n", | |
"\n---+\n", | |
"\n___+\n", | |
"\n\n", | |
"\n", | |
" ", | |
"", | |
] | |
logger.info("Extracting web loader...") | |
loader = SeleniumURLLoader(urls=urls) | |
docs = loader.load() | |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( | |
chunk_size=512, # The maximum number of characters in a chunk: we selected this value arbitrarily | |
chunk_overlap=50, # The number of characters to overlap between chunks | |
add_start_index=True, # If `True`, includes chunk's start index in metadata | |
strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document | |
separators=MARKDOWN_SEPARATORS, | |
) | |
logger.info("Split and documnets...") | |
docs_split = text_splitter.split_documents(docs) | |
for i, doc in enumerate(docs_split): | |
doc.metadata['id'] = str(uuid.uuid4())[:4] | |
doc.metadata['chunk-id'] = str(uuid.uuid4())[-4:] | |
return docs_split |