File size: 2,428 Bytes
3b0f177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline
from langchain.prompts import PromptTemplate

class RAGSystem:
    """Retrieval-augmented QA over a CSV of product titles.

    Builds a FAISS vector store from the ``Title`` column of the CSV, then
    answers free-text queries by retrieving relevant titles and running an
    extractive question-answering model over the retrieved context.
    """

    def __init__(
        self,
        csv_path="apparel.csv",
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
        qa_model="distilbert-base-cased-distilled-squad",
    ):
        """Load the CSV, build the vector index, and load the QA model.

        Args:
            csv_path: Path to a CSV file containing a ``Title`` column.
            embedding_model: HuggingFace model name used for embeddings.
            qa_model: HuggingFace model name for the extractive QA pipeline.

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
        """
        self.setup_system(csv_path, embedding_model)
        self.qa_pipeline = pipeline("question-answering", model=qa_model)

    def setup_system(self, csv_path, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
        """Read the CSV and build the FAISS vector store and retriever.

        Args:
            csv_path: Path to the CSV file to index.
            embedding_model: HuggingFace model name used for embeddings.

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
        """
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV file not found at {csv_path}")

        documents = pd.read_csv(csv_path)

        # One Document per row; str() guards against non-string Title cells
        # (e.g. NaN), and the row index is kept so results can be traced back.
        docs = [
            Document(
                page_content=str(row['Title']),
                metadata={'index': idx}
            )
            for idx, row in documents.iterrows()
        ]

        # Split long documents into overlapping chunks so each stays within
        # the embedding model's effective input size.
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        split_docs = text_splitter.split_documents(docs)

        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.vector_store = FAISS.from_documents(split_docs, embeddings)
        self.retriever = self.vector_store.as_retriever()

    def process_query(self, query):
        """Answer a question using retrieved documents as QA context.

        Args:
            query: Natural-language question.

        Returns:
            The answer string extracted by the QA model.
        """
        # invoke() is the current Runnable API; get_relevant_documents()
        # is deprecated in the langchain versions this file targets.
        retrieved_docs = self.retriever.invoke(query)

        # Concatenate retrieved text, truncated to keep the QA context
        # within the model's input budget.
        retrieved_text = "\n".join(doc.page_content for doc in retrieved_docs)[:1000]

        response = self.qa_pipeline({
            "question": query,
            "context": retrieved_text,
        })
        return response['answer']

    def get_similar_documents(self, query, k=5):
        """Return up to ``k`` documents most similar to ``query``.

        Bypasses the QA pipeline entirely; useful for inspecting retrieval.

        Args:
            query: Natural-language search string.
            k: Maximum number of documents to return.

        Returns:
            A list of dicts with ``content`` and ``metadata`` keys.
        """
        # Pass k to the similarity search directly: the default retriever
        # only returns 4 documents, so slicing afterwards could never
        # honor k > 4.
        docs = self.vector_store.similarity_search(query, k=k)
        return [{'content': doc.page_content, 'metadata': doc.metadata} for doc in docs]