import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from supabase import create_client, Client
# Load environment variables
from dotenv import load_dotenv
load_dotenv()
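# Expected .env entries (the values below are placeholders, not real credentials):
#   SUPABASE_URL=https://<your-project>.supabase.co
#   SUPABASE_KEY=<your-supabase-api-key>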
# Initialize Supabase client
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)
# Initialize SentenceTransformer
model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
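# The "quote_embeddings" table is assumed to already exist in Supabase with a
# pgvector column sized to this model's output; mxbai-embed-large-v1 produces
# 1024-dimensional embeddings. A minimal schema sketch (adjust names and types
# to your actual setup):
#
#   create table quote_embeddings (
#       id bigserial primary key,
#       chunk_id text,
#       quote_number text,
#       quote_text text,
#       embedding vector(1024)
#   );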
def process_text_file(file_path: str):
    # Read the file
    with open(file_path, 'r') as file:
        text = file.read()

    # Split the text into coarse chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " ", ""],
        keep_separator=False,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)

    # Process each chunk (with chunk_overlap > 0, quotes near chunk boundaries
    # may appear in two chunks and be uploaded twice)
    quote_count = 0
    for i, chunk in enumerate(chunks):
        # Further split the chunk into individual quotes (one quote per line)
        quotes = chunk.split('\n')
        for j, quote in enumerate(quotes):
            # Skip empty lines
            if not quote.strip():
                continue

            # Create the embedding for this quote
            embedding = model.encode(quote).tolist()

            # Extract quote number and text from lines formatted like "12. Quote text"
            parts = quote.split('. ', 1)
            quote_number = parts[0] if len(parts) > 1 and parts[0].isdigit() else None
            quote_text = parts[1] if len(parts) > 1 else quote

            # Upload to Supabase
            supabase.table("quote_embeddings").insert({
                "chunk_id": f"{i}_{j}",
                "quote_number": quote_number,
                "quote_text": quote_text,
                "embedding": embedding
            }).execute()
            quote_count += 1

    print(f"Processed and uploaded {quote_count} quotes to Supabase.")
# Usage
if __name__ == "__main__":
    file_path = "50_weather_quotes.txt"
    process_text_file(file_path)
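# A possible retrieval sketch (hypothetical: it assumes a pgvector similarity
# function named "match_quotes" has been created server-side and exposed as an
# RPC; that function is not part of this script):
#
# def search_quotes(query: str, limit: int = 5):
#     query_embedding = model.encode(query).tolist()
#     response = supabase.rpc(
#         "match_quotes",
#         {"query_embedding": query_embedding, "match_count": limit},
#     ).execute()
#     return response.data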