# NOTE(review): removed Hugging Face Spaces UI status text ("Spaces: / Sleeping")
# that was accidentally captured when this file was copied from the web UI.
import os

import pandas as pd
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline
class RAGSystem:
    """Retrieval-augmented QA over product titles loaded from a CSV file.

    Builds a FAISS vector index over the ``Title`` column of the CSV and
    answers free-text questions with an extractive QA model, using the
    retrieved titles as context.
    """

    def __init__(self, csv_path="apparel.csv"):
        """Build the vector store from *csv_path* and load the QA model.

        Args:
            csv_path: Path to a CSV file with a ``Title`` column.

        Raises:
            FileNotFoundError: If *csv_path* does not exist.
        """
        self.setup_system(csv_path)
        # Extractive QA: picks an answer span out of the retrieved context.
        self.qa_pipeline = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad",
        )

    def setup_system(self, csv_path):
        """Read the CSV and build the FAISS retriever over its ``Title`` column."""
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV file not found at {csv_path}")

        frame = pd.read_csv(csv_path)

        # One Document per row; str() guards against non-string titles
        # (e.g. NaN cells become the string "nan" rather than crashing).
        docs = [
            Document(page_content=str(row['Title']), metadata={'index': idx})
            for idx, row in frame.iterrows()
        ]

        # Titles are usually short, so splitting rarely fires, but it keeps
        # long outliers within the embedding model's input budget.
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        split_docs = text_splitter.split_documents(docs)

        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.vector_store = FAISS.from_documents(split_docs, embeddings)
        self.retriever = self.vector_store.as_retriever()

    def process_query(self, query):
        """Answer *query* extractively, using retrieved titles as context.

        Args:
            query: The user's question.

        Returns:
            The answer string extracted by the QA model.
        """
        retrieved_docs = self.retriever.get_relevant_documents(query)
        # Cap the context at 1000 characters to stay inside the QA model's
        # input limit (DistilBERT handles ~512 tokens).
        retrieved_text = "\n".join(doc.page_content for doc in retrieved_docs)[:1000]
        response = self.qa_pipeline({"question": query, "context": retrieved_text})
        return response['answer']

    def get_similar_documents(self, query, k=5):
        """Retrieve up to *k* similar documents without running the QA model.

        Fix: the previous version sliced the retriever's output to *k*, but
        ``as_retriever()`` returns only 4 documents by default, so ``k=5``
        could never be honored. Query the vector store directly with ``k``.

        Args:
            query: Text to search for.
            k: Maximum number of documents to return (default 5).

        Returns:
            A list of ``{'content': ..., 'metadata': ...}`` dicts.
        """
        docs = self.vector_store.similarity_search(query, k=k)
        return [{'content': doc.page_content, 'metadata': doc.metadata} for doc in docs]