ragtest-sakimilo / preprocess_raw_documents.py
lingyit1108's picture
swap to new embedding model and handle user 'i dont know' scenario
8c107a7
raw
history blame
546 Bytes
import os
import shutil
from tqdm import tqdm
def split_content(filepath: str, separator: str, tmp_folder: str) -> None:
    """Split a text file on *separator* and write each piece to its own file.

    Pieces are written into *tmp_folder* (created if it does not exist) as
    ``<name>_<NNN>.<ext>``, where ``<name>`` and ``<ext>`` are taken from the
    source file name and ``NNN`` is the zero-based piece index, zero-padded
    to three digits. The separator itself is not written to any piece.

    Args:
        filepath: Path of the text file to split.
        separator: Non-empty string marking the boundary between pieces.
        tmp_folder: Directory that receives the generated piece files.

    Raises:
        ValueError: If *separator* is empty (raised by ``str.split``).
        OSError: If the source cannot be read or a piece cannot be written.
    """
    os.makedirs(tmp_folder, exist_ok=True)

    base_file_name = os.path.basename(filepath)
    # Split on the LAST dot only, so names with several dots ("a.b.txt") or
    # none at all ("README") no longer fail on tuple unpacking. For names
    # with exactly one dot the resulting output names are unchanged.
    fname, dot, fextn = base_file_name.rpartition(".")
    if not dot:
        # No dot: rpartition puts the whole name in the last slot.
        fname, fextn = fextn, ""
    suffix = f"{dot}{fextn}"

    with open(filepath, "r") as src:
        content = src.read()

    content_chunk = content.split(separator)

    # Wrap the list (not the enumerate iterator) so tqdm can read its length
    # and display a total / ETA.
    for index, chunk in enumerate(tqdm(content_chunk)):
        new_fpath = os.path.join(tmp_folder, f"{fname}_{index:03d}{suffix}")
        with open(new_fpath, "w") as dst:
            dst.write(chunk)