import os
import streamlit as st
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
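# Dependencies (assumed package names): streamlit, langchain, langchain-community,
# langchain-text-splitters, langchain-huggingface, langchain-chroma,
# langchain-groq, sentence-transformers, "unstructured[pdf]"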
# Set the working directory to this file's location
working_dir = os.path.dirname(os.path.abspath(__file__))
# Read the Groq API key from the environment
groq_api_key = os.getenv("GROQ_API_KEY")
# Load the embedding model
embedding = HuggingFaceEmbeddings()
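# With no model_name argument, this falls back to the library default
# (sentence-transformers/all-mpnet-base-v2 at the time of writing),
# which is downloaded on first run.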
# Load the LLM from Groq
llm = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    temperature=0,
    groq_api_key=groq_api_key
)
def process_document_to_chroma_db(file_name):
    """Process the document and load it into Chroma DB."""
    # Load the document using the unstructured PDF loader
    loader = UnstructuredPDFLoader(f"{working_dir}/{file_name}")
    documents = loader.load()
    # Split the text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200
    )
    texts = text_splitter.split_documents(documents)
    # Load the documents into the Chroma vectorstore
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embedding,
        persist_directory=f"{working_dir}/doc_vectorstore"
    )
    return 0
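# NOTE: with langchain_chroma (backed by Chroma >= 0.4), passing
# persist_directory is enough; the collection is written to disk
# automatically, so no explicit persist() call is needed here.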
def answer_question(user_question):
    """Answer the user's question with retrieval-augmented generation
    over the persisted vector store."""
    # Load the persistent vectordb
    vectordb = Chroma(
        persist_directory=f"{working_dir}/doc_vectorstore",
        embedding_function=embedding
    )
    # Retriever
    retriever = vectordb.as_retriever()
    # Create a chain to answer the user's question using DeepSeek-R1
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
    response = qa_chain.invoke({"query": user_question})
    answer = response["result"]
    return answer
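# DeepSeek-R1 models typically emit their chain-of-thought wrapped in
# <think>...</think> tags before the final answer. The optional helper below
# is a minimal sketch (not part of the original chain) for hiding that
# reasoning; it assumes the tags appear verbatim in the model output and
# could be applied to `answer` before it is displayed.
import re

def strip_think_tags(text):
    """Remove any <think>...</think> reasoning block from the model output."""
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()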
# Streamlit interface
st.title("🐋 DeepSeek-R1 - Document RAG")

# File uploader widget
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    # Define the save path and save the uploaded file
    save_path = os.path.join(working_dir, uploaded_file.name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    # Process the document into the Chroma vectorstore
    process_document_to_chroma_db(uploaded_file.name)
    st.info("Document Processed Successfully")

# Text widget to get user input
user_question = st.text_area("Ask your question about the document")

if st.button("Answer"):
    # Answer the user's question
    answer = answer_question(user_question)
    # Display the response
    st.markdown("### DeepSeek-R1 Response")
    st.markdown(answer)
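# Run locally (assuming this file is saved as app.py and GROQ_API_KEY is set
# in the environment):
#   streamlit run app.py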