import os
import shutil
import time
from pathlib import Path

import lancedb
import numpy as np
import openai
import pandas as pd
import pyarrow as pa
import tqdm
from matplotlib import pyplot as plt

from gradio_app.backend.embedders import EmbedderFactory
from markdown_to_text import *
from settings import *

# Read the OpenAI API key from disk so the key itself stays out of the code.
with open('data/openaikey.txt') as f:
    OPENAI_KEY = f.read().strip()
openai.api_key = OPENAI_KEY

# Uncomment to wipe the whole LanceDB directory instead of overwriting one table:
# shutil.rmtree(LANCEDB_DIRECTORY, ignore_errors=True)
db = lancedb.connect(LANCEDB_DIRECTORY)
batch_size = 32

# One fixed-size float32 vector column (sized for the chosen embedder),
# plus the chunk text and the path of the source document.
schema = pa.schema([
    pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMBED_NAME])),
    pa.field(TEXT_COLUMN_NAME, pa.string()),
    pa.field(DOCUMENT_PATH_COLUMN_NAME, pa.string()),
])
table_name = f'{LANCEDB_TABLE_NAME}_{CHUNK_POLICY}_{EMBED_NAME}'
tbl = db.create_table(table_name, schema=schema, mode="overwrite")  # replace any previous run

input_dir = Path(MARKDOWN_SOURCE_DIR)
files = list(input_dir.rglob("*"))

# Chunk every markdown file under the source directory.
chunks = []
for file in files:
    if not os.path.isfile(file):
        continue

    file_path, file_ext = os.path.splitext(os.path.relpath(file, input_dir))
    if file_ext != '.md':
        print(f'Skipped {file_ext} extension: {file}')
        continue

    with open(file, encoding='utf-8') as f:
        text = f.read()
    text = remove_comments(text)

    # Split according to the configured policy: plain text or markdown-aware.
    if CHUNK_POLICY == "txt":
        parts = md2txt_then_split(text)
    else:
        assert CHUNK_POLICY == "md"
        parts = split_markdown(text)
    chunks.extend((chunk, os.path.abspath(file)) for chunk in parts)

# Eyeball the chunk-length distribution before paying for embeddings.
plt.hist([len(c) for c, _ in chunks], bins=100)
plt.title(table_name)
plt.show()

embedder = EmbedderFactory.get_embedder(EMBED_NAME)
time_embed, time_ingest = [], []
for i in tqdm.tqdm(range(int(np.ceil(len(chunks) / batch_size)))):
    texts, doc_paths = [], []
    for text, doc_path in chunks[i * batch_size:(i + 1) * batch_size]:
        if len(text) > 0:  # skip empty chunks
            texts.append(text)
            doc_paths.append(doc_path)

    t = time.perf_counter()
    encoded = embedder.embed(texts)
    time_embed.append(time.perf_counter() - t)

    df = pd.DataFrame({
        VECTOR_COLUMN_NAME: encoded,
        TEXT_COLUMN_NAME: texts,
        DOCUMENT_PATH_COLUMN_NAME: doc_paths,
    })
    t = time.perf_counter()
    tbl.add(df)
    time_ingest.append(time.perf_counter() - t)

time_embed = sum(time_embed)
time_ingest = sum(time_ingest)
print(f'Embedding: {time_embed:.1f}s, Ingesting: {time_ingest:.1f}s')
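
# Illustrative sanity check (a sketch, not part of the original pipeline):
# embed a single query with the same embedder and search the freshly built
# table. Assumes `embedder.embed` returns a list of vectors, as in the batch
# loop above; the `vector_column_name` argument and `to_pandas()` accessor
# are available in recent lancedb releases.
query_vec = embedder.embed(["example query"])[0]
hits = (
    tbl.search(query_vec, vector_column_name=VECTOR_COLUMN_NAME)
    .limit(5)
    .to_pandas()
)
print(hits[[TEXT_COLUMN_NAME, DOCUMENT_PATH_COLUMN_NAME]])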