hadirashwan
/

wear_what_clothing_suggestion

Text Generation

Model card Files Files and versions Community

wear_what_clothing_suggestion / setup_db.py

Hady Rashwan

wip

f962791 4 months ago

history blame contribute delete

1.99 kB

	import os
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from sentence_transformers import SentenceTransformer
	from supabase import create_client, Client

	# Populate os.environ from a local .env file before any setting is read
	from dotenv import load_dotenv
	load_dotenv()

	# Supabase connection, configured via SUPABASE_URL / SUPABASE_KEY
	supabase_url = os.environ.get("SUPABASE_URL")
	supabase_key = os.environ.get("SUPABASE_KEY")
	supabase: Client = create_client(supabase_url, supabase_key)

	# Sentence-embedding model, loaded once at import and used by process_text_file
	model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')

	def _split_numbered_quote(line: str):
	    """Separate an optional ``"<digits>. "`` prefix from a quote line.

	    Returns a ``(quote_number, quote_text)`` pair. ``quote_number`` is the
	    digit string when the line starts with an all-digit prefix followed by
	    ``". "``; otherwise it is ``None`` and ``quote_text`` is the whole line,
	    so an un-numbered quote that merely contains ``". "`` is not truncated.
	    """
	    prefix, separator, remainder = line.partition('. ')
	    if separator and prefix.isdigit():
	        return prefix, remainder
	    return None, line


	def process_text_file(file_path: str):
	    """Embed every quote in a text file and upload the rows to Supabase.

	    The file is read as UTF-8, cut into chunks with
	    ``RecursiveCharacterTextSplitter``, and every chunk is split on newlines.
	    Each non-blank line is treated as one quote: it is encoded with the
	    module-level ``model`` and inserted as a single row into the
	    ``quote_embeddings`` table through the module-level ``supabase`` client.

	    Args:
	        file_path: Path of the text file to ingest. Lines may start with a
	            ``"<number>. "`` prefix, which is stored separately from the text.

	    Returns:
	        None. A confirmation message is printed when all rows are uploaded.
	    """
	    # Explicit encoding: the platform default is not UTF-8 everywhere, which
	    # would garble or reject non-ASCII characters in the quotes.
	    with open(file_path, 'r', encoding='utf-8') as file:
	        text = file.read()

	    # Split the text
	    # NOTE(review): with chunk_overlap=200, lines near a chunk boundary may
	    # show up in two consecutive chunks and be inserted twice - confirm
	    # against files larger than chunk_size.
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=10000,
	        chunk_overlap=200,
	        separators=["\n\n", "\n", ". ", " ", ""],
	        keep_separator=False,
	        length_function=len,
	    )
	    chunks = text_splitter.split_text(text)

	    # Process each chunk
	    for i, chunk in enumerate(chunks):
	        # Further split the chunk into individual quotes; j counts every
	        # line (blank ones included) so chunk_id reflects the line position.
	        for j, quote in enumerate(chunk.split('\n')):
	            # Skip empty lines
	            if not quote.strip():
	                continue

	            # The raw line, including any number prefix, is what gets embedded.
	            embedding = model.encode(quote).tolist()

	            # Strip the "<number>. " prefix only when it really is a number.
	            quote_number, quote_text = _split_numbered_quote(quote)

	            # Upload to Supabase (one row per quote)
	            supabase.table("quote_embeddings").insert({
	                "chunk_id": f"{i}_{j}",
	                "quote_number": quote_number,
	                "quote_text": quote_text,
	                "embedding": embedding,
	            }).execute()

	    print("Processed and uploaded quotes to Supabase.")

	# Script entry point: ingest the quotes file when run directly
	if __name__ == "__main__":
	    process_text_file("50_weather_quotes.txt")