import os
import streamlit as st

from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
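
# Streamlit RAG app: upload a PDF, index it into a local Chroma vector
# store, and answer questions over it with DeepSeek-R1 served via Groq.
# Assumed dependencies: streamlit, langchain, langchain-community,
# langchain-chroma, langchain-groq, langchain-huggingface,
# langchain-text-splitters, unstructured[pdf]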

# Set the working directory
working_dir = os.path.dirname(os.path.abspath(__file__))

# Groq API key; ChatGroq also reads GROQ_API_KEY from the environment
groq_api_key = os.getenv("GROQ_API_KEY")

# Load the embedding model (defaults to sentence-transformers/all-mpnet-base-v2)
embedding = HuggingFaceEmbeddings()

# Load the LLM from Groq (a DeepSeek-R1 distillation of Llama-70B);
# temperature=0 keeps answers deterministic
llm = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    temperature=0,
    api_key=groq_api_key
)


def process_document_to_chroma_db(file_name):
    """Process the document and load it into Chroma DB."""
    # Load the document using unstructured PDF loader
    loader = UnstructuredPDFLoader(f"{working_dir}/{file_name}")
    documents = loader.load()

    # Split the text into overlapping chunks so retrieved passages
    # keep context across chunk boundaries
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200
    )
    texts = text_splitter.split_documents(documents)

    # Embed the chunks and persist them into a Chroma vector store on disk
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embedding,
        persist_directory=f"{working_dir}/doc_vectorstore"
    )
    return vectordb


def answer_question(user_question):
    """Answer the user's question via retrieval-augmented generation
    over the persisted vector store."""
    # Load the persistent vectordb
    vectordb = Chroma(
        persist_directory=f"{working_dir}/doc_vectorstore",
        embedding_function=embedding
    )

    # Retriever (defaults to similarity search over the top-4 chunks)
    retriever = vectordb.as_retriever()

    # Build a RetrievalQA chain; the "stuff" chain type concatenates all
    # retrieved chunks into a single prompt for the LLM
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
    response = qa_chain.invoke({"query": user_question})
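    # Note: DeepSeek-R1 distill models may include <think>...</think>
    # reasoning traces in the returned text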
    answer = response["result"]

    return answer


# Streamlit interface
st.title("🐋 DeepSeek-R1 - Document RAG")

# File uploader widget
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    # Define save path and save the uploaded file
    save_path = os.path.join(working_dir, uploaded_file.name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Process the document once per upload; Streamlit reruns the whole
    # script on every interaction, so guard against re-embedding the
    # same file on every button click
    if st.session_state.get("processed_file") != uploaded_file.name:
        process_document_to_chroma_db(uploaded_file.name)
        st.session_state["processed_file"] = uploaded_file.name
    st.info("Document Processed Successfully")

# Text widget to get user input
user_question = st.text_area("Ask your question about the document")

if st.button("Answer") and user_question.strip():
    # Answer the user's question
    answer = answer_question(user_question)

    # Display the response
    st.markdown("### DeepSeek-R1 Response")
    st.markdown(answer)