from typing import List, TypedDict

from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
import tiktoken

def load_web_documents(urls: List[str]) -> List[Document]:
    """
    Load documents from web URLs.

    Args:
        urls: List of URLs to load

    Returns:
        List of loaded documents
    """
    loader = WebBaseLoader(urls)
    return loader.load()

def create_rag_pipeline(collection_name: str = "rag_collection"):
    """
    Create the core RAG components: vector store, text splitter, and retriever.

    Args:
        collection_name: Name of the Qdrant collection to create

    Returns:
        Dict with "vector_store", "text_splitter", and "retriever" keys
    """
    # OpenAI embedding model
    # embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
    # Fine-tuned embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name="ric9176/cjo-ft-v0",
    )
    # embedding_dim = 1536  # Dimension for text-embedding-3-small
    embedding_dim = 1024  # Dimension for Snowflake/snowflake-arctic-embed-l

    # Initialize Qdrant client (in-memory for development)
    client = QdrantClient(":memory:")

    # Create collection for vectors
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
    )

    # Create vector store
    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embedding_model,
    )

    # Token-based length function so chunk sizes are measured in tokens, not characters
    encoding = tiktoken.encoding_for_model("gpt-4o-mini")

    def tiktoken_len(text):
        return len(encoding.encode(text))

    # Create text splitter for chunking
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Adjust based on your needs
        chunk_overlap=50,
        length_function=tiktoken_len,
    )

    # Create retriever
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    return {
        "vector_store": vector_store,
        "text_splitter": text_splitter,
        "retriever": retriever,
    }

def add_documents(vector_store, text_splitter, documents: List[Document]):
    """
    Add documents to the vector store.

    Args:
        vector_store: The initialized vector store
        text_splitter: The text splitter for chunking
        documents: List of Document objects to add
    """
    # Split the page content of each document into chunks
    # (note: only the text is stored; document metadata is not carried over)
    chunks = []
    for doc in documents:
        doc_chunks = text_splitter.split_text(doc.page_content)
        chunks.extend(doc_chunks)

    # Add chunks to vector store
    vector_store.add_texts(texts=chunks)

def add_urls_to_vectorstore(vector_store, text_splitter, urls: List[str]):
    """
    Load documents from URLs and add them to the vector store.

    Args:
        vector_store: The initialized vector store
        text_splitter: The text splitter for chunking
        urls: List of URLs to load and add
    """
    # Load documents from URLs
    documents = load_web_documents(urls)

    # Add documents to vector store
    add_documents(vector_store, text_splitter, documents)

def get_relevant_context(retriever, question: str) -> List[Document]:
    """
    Get relevant context for a question.

    Args:
        retriever: The initialized retriever
        question: The question to find context for

    Returns:
        List of relevant documents
    """
    # On newer LangChain releases, retriever.invoke(question) is the equivalent call
    return retriever.get_relevant_documents(question)
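
# Illustrative usage sketch (not part of the pipeline functions above): the URL
# below is a placeholder, and running this assumes network access plus the
# ability to download the "ric9176/cjo-ft-v0" embedding model from the
# Hugging Face Hub.
if __name__ == "__main__":
    pipeline = create_rag_pipeline()

    # Index a page, then retrieve context for a question
    add_urls_to_vectorstore(
        pipeline["vector_store"],
        pipeline["text_splitter"],
        ["https://example.com"],  # placeholder URL
    )
    docs = get_relevant_context(pipeline["retriever"], "What is this page about?")
    for doc in docs:
        print(doc.page_content[:200])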