Spaces:
Sleeping
Sleeping
nileshhanotia
committed on
Commit
•
3b0f177
1
Parent(s):
8cd9024
Create rag_system.py
Browse files- rag_system.py +61 -0
rag_system.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
4 |
+
from langchain_community.vectorstores import FAISS
|
5 |
+
from langchain.text_splitter import CharacterTextSplitter
|
6 |
+
from langchain.docstore.document import Document
|
7 |
+
from transformers import pipeline
|
8 |
+
from langchain.prompts import PromptTemplate
|
9 |
+
|
10 |
+
class RAGSystem:
    """Retrieval-augmented QA over a CSV of product titles.

    Builds a FAISS vector store from the ``Title`` column of the CSV and
    answers questions with an extractive question-answering pipeline run
    over the retrieved text.
    """

    # Maximum number of context characters fed to the extractive QA model.
    # DistilBERT has a bounded input length, so the joined context is
    # truncated to this budget before being passed to the pipeline.
    MAX_CONTEXT_CHARS = 1000

    def __init__(self, csv_path="apparel.csv"):
        # Build the vector store / retriever first, then load the QA model.
        self.setup_system(csv_path)
        self.qa_pipeline = pipeline(
            "question-answering",
            model="distilbert-base-cased-distilled-squad",
        )

    def setup_system(self, csv_path):
        """Load the CSV, embed its titles, and build the FAISS retriever.

        Args:
            csv_path: path to a CSV file with a ``Title`` column.

        Raises:
            FileNotFoundError: if ``csv_path`` does not exist.
        """
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV file not found at {csv_path}")

        documents = pd.read_csv(csv_path)

        # One Document per row; str() guards against non-string titles
        # (e.g. NaN), and the row index is kept so results can be traced
        # back to the original CSV.
        docs = [
            Document(page_content=str(row['Title']), metadata={'index': idx})
            for idx, row in documents.iterrows()
        ]

        # Split documents into chunks sized for the embedding model.
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        split_docs = text_splitter.split_documents(docs)

        # Create embeddings and the FAISS vector store / retriever.
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        self.vector_store = FAISS.from_documents(split_docs, embeddings)
        self.retriever = self.vector_store.as_retriever()

    def process_query(self, query):
        """Answer ``query`` extractively from the most relevant documents.

        Args:
            query: natural-language question.

        Returns:
            The extracted answer string, or a fallback message when no
            relevant documents were retrieved (the QA pipeline errors on
            an empty context).
        """
        retrieved_docs = self.retriever.get_relevant_documents(query)
        if not retrieved_docs:
            # Guard: the QA pipeline raises on an empty context string.
            return "No relevant information found."

        # Join retrieved contents, then truncate to the model's budget.
        retrieved_text = "\n".join(doc.page_content for doc in retrieved_docs)
        retrieved_text = retrieved_text[:self.MAX_CONTEXT_CHARS]

        qa_input = {
            "question": query,
            "context": retrieved_text,
        }
        response = self.qa_pipeline(qa_input)

        return response['answer']

    def get_similar_documents(self, query, k=5):
        """Retrieve up to ``k`` similar documents, skipping the QA pipeline.

        Bug fix: the original retrieved through the retriever with its
        default result limit (typically 4) and then sliced ``[:k]``, so any
        ``k`` larger than that default could never be honored. Query the
        vector store directly with the requested ``k`` instead.

        Args:
            query: natural-language query to match against.
            k: maximum number of documents to return.

        Returns:
            A list of dicts with ``'content'`` and ``'metadata'`` keys.
        """
        docs = self.vector_store.similarity_search(query, k=k)
        return [{'content': doc.page_content, 'metadata': doc.metadata} for doc in docs]
|