nileshhanotia committed on
Commit
3b0f177
1 Parent(s): 8cd9024

Create rag_system.py

Browse files
Files changed (1) hide show
  1. rag_system.py +61 -0
rag_system.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain.docstore.document import Document
7
+ from transformers import pipeline
8
+ from langchain.prompts import PromptTemplate
9
+
10
class RAGSystem:
    """Retrieval-augmented question answering over a CSV of product text.

    On construction this builds a FAISS vector index over one text column of
    the CSV (default ``Title``) using sentence-transformer embeddings, then
    answers queries by retrieving similar rows and running an extractive
    (span-selection) QA model over the retrieved text.
    """

    def __init__(self, csv_path="apparel.csv", text_column="Title"):
        """Build the retrieval index and load the QA model.

        Args:
            csv_path: Path to the source CSV file.
            text_column: Name of the CSV column to index (default ``"Title"``,
                matching the original hard-coded behavior).

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
            KeyError: If ``text_column`` is not a column of the CSV.
        """
        self.setup_system(csv_path, text_column)
        # Extractive QA: the model selects an answer span from the supplied
        # context rather than generating free-form text.
        self.qa_pipeline = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad",
        )

    def setup_system(self, csv_path, text_column="Title"):
        """Load the CSV and build the FAISS vector store and retriever.

        Args:
            csv_path: Path to the source CSV file.
            text_column: Column whose values become document contents.

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
            KeyError: If ``text_column`` is missing from the CSV, with a
                message listing the available columns.
        """
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV file not found at {csv_path}")

        frame = pd.read_csv(csv_path)
        if text_column not in frame.columns:
            # Fail early with a clear message instead of a bare KeyError
            # surfacing mid-iteration.
            raise KeyError(
                f"Column {text_column!r} not found in {csv_path}; "
                f"available columns: {list(frame.columns)}"
            )

        # One Document per row; str() guards against non-string cells
        # (numbers, NaN) so the embedder always receives text.
        docs = [
            Document(page_content=str(text), metadata={"index": idx})
            for idx, text in frame[text_column].items()
        ]

        # Chunk long documents so each piece fits the embedder comfortably.
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        split_docs = text_splitter.split_documents(docs)

        # Embed the chunks and index them in FAISS for similarity search.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.vector_store = FAISS.from_documents(split_docs, embeddings)
        self.retriever = self.vector_store.as_retriever()

    def process_query(self, query):
        """Answer *query* using retrieved context and the QA model.

        Args:
            query: Natural-language question.

        Returns:
            The answer span (``str``) extracted by the QA pipeline.
        """
        # get_relevant_documents is kept (rather than .invoke) for
        # compatibility with the langchain version this file targets.
        retrieved_docs = self.retriever.get_relevant_documents(query)

        # Join retrieved contents, then cap the context at 1000 characters —
        # extractive QA models have a limited input window.
        retrieved_text = "\n".join(doc.page_content for doc in retrieved_docs)[:1000]

        qa_input = {
            "question": query,
            "context": retrieved_text,
        }
        response = self.qa_pipeline(qa_input)

        return response['answer']

    def get_similar_documents(self, query, k=5):
        """Retrieve up to *k* similar documents without running the QA model.

        Args:
            query: Natural-language query to match against the index.
            k: Maximum number of documents to return (default 5).

        Returns:
            A list of dicts with ``'content'`` and ``'metadata'`` keys.
        """
        # Bug fix: the previous implementation called the retriever (whose
        # default top-k is 4, independent of *k*) and then sliced to *k*,
        # so k > 4 silently returned at most 4 documents. Querying the
        # vector store directly honors the requested k.
        docs = self.vector_store.similarity_search(query, k=k)
        return [{'content': doc.page_content, 'metadata': doc.metadata} for doc in docs]