import os
import shutil
import time
from pathlib import Path

import lancedb
import numpy as np
import openai
import pandas as pd
import pyarrow as pa
import tqdm

from gradio_app.backend.embedders import EmbedderFactory
from markdown_to_text import *
from settings import *
# Read the OpenAI API key from its local file and hand it to the client library.
OPENAI_KEY = Path('data/openaikey.txt').read_text().strip()
openai.api_key = OPENAI_KEY

db = lancedb.connect(LANCEDB_DIRECTORY)

# Number of chunks processed per embedding/ingestion batch.
batch_size = 32

# Table layout: an embedding column (list of float32 whose length is looked up
# in emb_sizes for the configured embedder), the chunk text, and the path of
# the document the chunk came from.
embedding_type = pa.list_(pa.float32(), emb_sizes[EMBED_NAME])
table_fields = [
    pa.field(VECTOR_COLUMN_NAME, embedding_type),
    pa.field(TEXT_COLUMN_NAME, pa.string()),
    pa.field(DOCUMENT_PATH_COLUMN_NAME, pa.string()),
]
schema = pa.schema(table_fields)

# Created with mode="overwrite", so each run starts from a fresh table.
tbl = db.create_table(table_name, schema=schema, mode="overwrite")
# Walk the markdown source tree and split every .md file into chunks,
# recording for each chunk the absolute path of the file it came from.
input_dir = Path(MARKDOWN_SOURCE_DIR)
files = list(input_dir.rglob("*"))

# Select the splitter once, before touching any file. An explicit raise is used
# instead of `assert` so the check still runs under `python -O`; otherwise an
# unknown policy would silently fall through to the markdown splitter.
if CHUNK_POLICY == "txt":
    split_into_chunks = md2txt_then_split
elif CHUNK_POLICY == "md":
    split_into_chunks = split_markdown
else:
    raise ValueError(f'Unsupported CHUNK_POLICY: {CHUNK_POLICY!r}')

chunks = []  # (chunk, absolute document path) pairs
for file in files:
    # rglob("*") also yields directories; only regular files are processed.
    if not os.path.isfile(file):
        continue
    file_ext = os.path.splitext(file)[1]
    if file_ext != '.md':
        print(f'Skipped {file_ext} extension: {file}')
        continue
    with open(file, encoding='utf-8') as md_file:
        markdown = md_file.read()
    markdown = remove_comments(markdown)
    doc_path = os.path.abspath(file)
    chunks.extend((chunk, doc_path) for chunk in split_into_chunks(markdown))
# Show a 100-bin histogram of chunk lengths, titled with the table name,
# as a quick visual check of the chunking result.
import matplotlib.pyplot as plt

chunk_lengths = [len(chunk_text) for chunk_text, _ in chunks]
plt.hist(chunk_lengths, bins=100)
plt.title(table_name)
plt.show()
# Embed the collected chunks batch by batch and append them to the table,
# timing the embedding and the ingestion steps separately.
embedder = EmbedderFactory.get_embedder(EMBED_NAME)

time_embed, time_ingest = [], []
for start in tqdm.tqdm(range(0, len(chunks), batch_size)):
    # Drop empty chunks; there is nothing to embed for them.
    batch = [pair for pair in chunks[start:start + batch_size] if len(pair[0]) > 0]
    if not batch:
        # Every chunk in this slice was empty: skip the embedder call entirely.
        continue
    texts = [text for text, _ in batch]
    doc_paths = [doc_path for _, doc_path in batch]

    t = time.perf_counter()
    encoded = embedder.embed(texts)
    time_embed.append(time.perf_counter() - t)

    # Build one row per chunk using the column names declared in the table
    # schema. The frame used to be created from an empty dict, so no data was
    # ever written to the table.
    # NOTE(review): assumes embed() returns one vector per input text, in
    # order; list() turns a 2-D array into per-row vectors - confirm against
    # the embedder implementation.
    df = pd.DataFrame({
        VECTOR_COLUMN_NAME: list(encoded),
        TEXT_COLUMN_NAME: texts,
        DOCUMENT_PATH_COLUMN_NAME: doc_paths,
    })

    t = time.perf_counter()
    tbl.add(df)
    time_ingest.append(time.perf_counter() - t)

# Collapse the per-batch timings into totals (in seconds) and report them.
time_embed = sum(time_embed)
time_ingest = sum(time_ingest)
print(f'Embedding: {time_embed}, Ingesting: {time_ingest}')