# pythonic-rag / rag.py
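"""Pythonic RAG helpers: load web pages, chunk them with a token-aware splitter,
embed them into an in-memory Qdrant collection, and retrieve the chunks most
relevant to a question. The embedding model is a fine-tuned checkpoint hosted on
Hugging Face; an OpenAI embedding alternative is left commented out in
create_rag_pipeline().
"""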
from typing import List, TypedDict
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings  # used by the commented-out OpenAI embedding option
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
import tiktoken


def load_web_documents(urls: List[str]) -> List[Document]:
    """
    Load documents from web URLs.

    Args:
        urls: List of URLs to load

    Returns:
        List of loaded documents
    """
    loader = WebBaseLoader(urls)
    return loader.load()


def create_rag_pipeline(collection_name: str = "rag_collection"):
    """
    Build the RAG components: embedding model, in-memory Qdrant vector store,
    token-aware text splitter, and retriever.

    Args:
        collection_name: Name of the Qdrant collection to create

    Returns:
        Dict with "vector_store", "text_splitter", and "retriever"
    """
    # OpenAI embedding model (alternative):
    # embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
    # Fine-tuned embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name="ric9176/cjo-ft-v0",
    )
    # embedding_dim = 1536  # Dimension for text-embedding-3-small
    embedding_dim = 1024  # Dimension for Snowflake/snowflake-arctic-embed-l (base of the fine-tuned model)

    # Initialize Qdrant client (in-memory for development; data is lost on exit)
    client = QdrantClient(":memory:")

    # Create the collection that will hold the embedding vectors
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
    )

    # Wrap the collection in a LangChain vector store
    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embedding_model,
    )

    # Token-aware length function for chunking; build the encoder once and reuse it
    encoding = tiktoken.encoding_for_model("gpt-4o-mini")

    def tiktoken_len(text: str) -> int:
        return len(encoding.encode(text))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Adjust based on your needs
        chunk_overlap=50,
        length_function=tiktoken_len,
    )

    # Retriever returning the top 5 most similar chunks
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    return {
        "vector_store": vector_store,
        "text_splitter": text_splitter,
        "retriever": retriever,
    }


def add_documents(vector_store, text_splitter, documents: List[Document]):
    """
    Add documents to the vector store.

    Args:
        vector_store: The initialized vector store
        text_splitter: The text splitter for chunking
        documents: List of Document objects to add
    """
    # Split each document's page content into chunks.
    # Note: only the raw text is stored; document metadata is not carried over
    # (use text_splitter.split_documents / vector_store.add_documents to keep it).
    chunks = []
    for doc in documents:
        doc_chunks = text_splitter.split_text(doc.page_content)
        chunks.extend(doc_chunks)

    # Add chunks to the vector store
    vector_store.add_texts(texts=chunks)


def add_urls_to_vectorstore(vector_store, text_splitter, urls: List[str]):
    """
    Load documents from URLs and add them to the vector store.

    Args:
        vector_store: The initialized vector store
        text_splitter: The text splitter for chunking
        urls: List of URLs to load and add
    """
    # Load documents from URLs, then chunk and index them
    documents = load_web_documents(urls)
    add_documents(vector_store, text_splitter, documents)


def get_relevant_context(retriever, question: str) -> List[Document]:
    """
    Get relevant context for a question.

    Args:
        retriever: The initialized retriever
        question: The question to find context for

    Returns:
        List of relevant documents
    """
    # Retrievers are Runnables in current LangChain; invoke() replaces the
    # deprecated get_relevant_documents() call.
    return retriever.invoke(question)
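

# A minimal usage sketch (not part of the original module). It wires the helpers
# above together end to end; the URL and question below are placeholders, not
# values from the original project.
if __name__ == "__main__":
    pipeline = create_rag_pipeline(collection_name="rag_collection")

    # Index a page, then retrieve the chunks most relevant to a question
    add_urls_to_vectorstore(
        pipeline["vector_store"],
        pipeline["text_splitter"],
        urls=["https://example.com"],  # placeholder URL
    )
    docs = get_relevant_context(pipeline["retriever"], "What is this page about?")
    for doc in docs:
        print(doc.page_content[:200])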