|
import os |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from sentence_transformers import SentenceTransformer |
|
from supabase import create_client, Client |
|
|
|
|
|
from dotenv import load_dotenv |
|
# Load variables from a .env file (if one is found) into the process
# environment before the credentials below are read.
load_dotenv()

# Supabase client shared by the rest of this module; credentials come from
# the SUPABASE_URL / SUPABASE_KEY environment variables.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Sentence-embedding model used to vectorize each quote; loaded once at
# import time so every call reuses the same instance.
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
|
|
|
def _split_quote_number(quote: str) -> tuple:
    """Split a leading ``"<digits>. "`` prefix off *quote*.

    Returns ``(number, text)``. When the line does not start with an
    all-digit prefix followed by ``". "``, ``number`` is ``None`` and
    ``text`` is the unmodified input, so un-numbered lines that merely
    contain ``". "`` are never truncated.
    """
    prefix, sep, rest = quote.partition(". ")
    if sep and prefix.isdigit():
        return prefix, rest
    return None, quote


def process_text_file(file_path: str) -> None:
    """Embed every non-blank line of a text file and upload it to Supabase.

    The file is split into chunks with ``RecursiveCharacterTextSplitter``,
    each chunk is split into lines, and every non-blank line is encoded with
    the module-level ``model`` and inserted into the ``quote_embeddings``
    table through the module-level ``supabase`` client.

    Each inserted row has:
        chunk_id:     ``"<chunk index>_<line index within chunk>"``
        quote_number: the leading digits of a ``"12. text"`` line, else None
        quote_text:   the line without its numeric prefix
        embedding:    the model's vector for the full line, as a list

    Args:
        file_path: Path of the text file to process; read as UTF-8.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding: the platform default differs between systems and
    # can fail or garble non-ASCII characters such as typographic quotes.
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # NOTE(review): with chunk_overlap=200, lines near a chunk boundary may
    # appear in two adjacent chunks and be inserted twice - confirm whether
    # the overlap is wanted for line-oriented input.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " ", ""],
        keep_separator=False,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)

    for i, chunk in enumerate(chunks):
        # Keep the original line index j so chunk_id values are unchanged
        # even though blank lines are skipped.
        numbered_lines = [
            (j, quote)
            for j, quote in enumerate(chunk.split("\n"))
            if quote.strip()
        ]
        if not numbered_lines:
            continue

        # One encode call and one insert per chunk instead of one per line.
        embeddings = model.encode(
            [quote for _, quote in numbered_lines]
        ).tolist()

        rows = []
        for (j, quote), embedding in zip(numbered_lines, embeddings):
            quote_number, quote_text = _split_quote_number(quote)
            rows.append({
                "chunk_id": f"{i}_{j}",
                "quote_number": quote_number,
                "quote_text": quote_text,
                "embedding": embedding,
            })

        supabase.table("quote_embeddings").insert(rows).execute()

    print("Processed and uploaded quotes to Supabase.")
|
|
|
|
|
# Script entry point: process the bundled sample file when run directly.
if __name__ == "__main__":
    process_text_file("50_weather_quotes.txt")