Manasa1 committed on
Commit
0f50957
·
verified ·
1 Parent(s): 8d32e60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -84
app.py CHANGED
@@ -1,106 +1,97 @@
1
- from dotenv import load_dotenv
2
  import streamlit as st
 
 
3
  from langchain_community.document_loaders import UnstructuredPDFLoader
4
- from langchain_text_splitters.character import CharacterTextSplitter
5
- from langchain_community.vectorstores import FAISS
6
- from langchain_community.embeddings import HuggingFaceEmbeddings
7
  from langchain_groq import ChatGroq
8
- from langchain.memory import ConversationBufferMemory
9
- from langchain.chains import ConversationalRetrievalChain
10
- import os
11
- import nltk
12
- nltk.download('punkt_tab')
13
- nltk.download('averaged_perceptron_tagger_eng')
14
- # Install Poppler and Tesseract in the runtime environment
15
- os.system("apt-get update && apt-get install -y poppler-utils tesseract-ocr")
16
-
17
- secret = os.getenv('Groq_api')
18
-
19
- working_dir = os.path.dirname(os.path.abspath(__file__))
20
-
21
- def load_documents(file_path):
22
- # Specify poppler_path and tesseract_path to ensure compatibility
23
- loader = UnstructuredPDFLoader(
24
- file_path,
25
- poppler_path="/usr/bin",
26
- tesseract_path="/usr/bin/tesseract"
27
- )
 
28
  documents = loader.load()
29
- return documents
30
 
31
- def setup_vectorstore(documents):
32
- embeddings = HuggingFaceEmbeddings()
33
- text_splitter = CharacterTextSplitter(
34
- separator="/n",
35
- chunk_size=1000,
36
  chunk_overlap=200
37
  )
38
- doc_chunks = text_splitter.split_documents(documents)
39
- vectorstores = FAISS.from_documents(doc_chunks, embeddings)
40
- return vectorstores
41
-
42
- def create_chain(vectorstores):
43
- llm = ChatGroq(
44
- api_key=secret,
45
- model="llama-3.1-8b-instant",
46
- temperature=0
47
  )
48
- retriever = vectorstores.as_retriever()
49
- memory = ConversationBufferMemory(
50
- llm=llm,
51
- output_key="answer",
52
- memory_key="chat_history",
53
- return_messages=True
 
 
 
54
  )
55
- chain = ConversationalRetrievalChain.from_llm(
 
 
 
 
 
56
  llm=llm,
 
57
  retriever=retriever,
58
- memory=memory,
59
- verbose=True
60
  )
61
- return chain
 
62
 
63
- # Streamlit page configuration
64
- st.set_page_config(
65
- page_title="Chat with your documents",
66
- page_icon="📑",
67
- layout="centered"
68
- )
69
 
70
- st.title("📝Chat With your docs 😎")
71
 
72
- # Initialize session states
73
- if "chat_history" not in st.session_state:
74
- st.session_state.chat_history = []
75
 
76
- uploaded_file = st.file_uploader(label="Upload your PDF")
 
77
 
78
- if uploaded_file:
79
- file_path = f"{working_dir}/{uploaded_file.name}"
80
- with open(file_path, "wb") as f:
 
81
  f.write(uploaded_file.getbuffer())
82
 
83
- if "vectorstores" not in st.session_state:
84
- st.session_state.vectorstores = setup_vectorstore(load_documents(file_path))
85
-
86
- if "conversation_chain" not in st.session_state:
87
- st.session_state.conversation_chain = create_chain(st.session_state.vectorstores)
88
-
89
- # Display chat history
90
- for message in st.session_state.chat_history:
91
- with st.chat_message(message["role"]):
92
- st.markdown(message["content"])
93
 
94
- # User input handling
95
- user_input = st.chat_input("Ask any questions relevant to uploaded pdf")
96
 
97
- if user_input:
98
- st.session_state.chat_history.append({"role": "user", "content": user_input})
99
- with st.chat_message("user"):
100
- st.markdown(user_input)
101
 
102
- with st.chat_message("assistant"):
103
- response = st.session_state.conversation_chain({"question": user_input})
104
- assistant_response = response["answer"]
105
- st.markdown(assistant_response)
106
- st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
 
1
+ import os
2
  import streamlit as st
3
+ from huggingface_hub import HfApi
4
+
5
  from langchain_community.document_loaders import UnstructuredPDFLoader
6
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
7
+ from langchain_huggingface import HuggingFaceEmbeddings
8
+ from langchain_chroma import Chroma
9
  from langchain_groq import ChatGroq
10
+ from langchain.chains import RetrievalQA
11
+
12
+ # Set the working directory
13
+ working_dir = os.path.dirname(os.path.abspath((__file__)))
14
+
15
+ secret = os.getenv('GROQ_API_KEY')
16
+
17
+ # Loading the embedding model
18
+ embedding = HuggingFaceEmbeddings()
19
+
20
+ # Load the llm from Groq
21
+ llm = ChatGroq(
22
+ model="deepseek-r1-distill-llama-70b",
23
+ temperature=0
24
+ )
25
+
26
+
27
+ def process_document_to_chroma_db(file_name):
28
+ """Process the document and load it into Chroma DB."""
29
+ # Load the document using unstructured PDF loader
30
+ loader = UnstructuredPDFLoader(f"{working_dir}/{file_name}")
31
  documents = loader.load()
 
32
 
33
+ # Split the text into chunks
34
+ text_splitter = RecursiveCharacterTextSplitter(
35
+ chunk_size=2000,
 
 
36
  chunk_overlap=200
37
  )
38
+ texts = text_splitter.split_documents(documents)
39
+
40
+ # Load the documents into Chroma vectorstore
41
+ vectordb = Chroma.from_documents(
42
+ documents=texts,
43
+ embedding=embedding,
44
+ persist_directory=f"{working_dir}/doc_vectorstore"
 
 
45
  )
46
+ return 0
47
+
48
+
49
+ def answer_question(user_question):
50
+ """Answer the user's question using the trained model."""
51
+ # Load the persistent vectordb
52
+ vectordb = Chroma(
53
+ persist_directory=f"{working_dir}/doc_vectorstore",
54
+ embedding_function=embedding
55
  )
56
+
57
+ # Retriever
58
+ retriever = vectordb.as_retriever()
59
+
60
+ # Create a chain to answer user question using DeepSeek-R1
61
+ qa_chain = RetrievalQA.from_chain_type(
62
  llm=llm,
63
+ chain_type="stuff",
64
  retriever=retriever,
 
 
65
  )
66
+ response = qa_chain.invoke({"query": user_question})
67
+ answer = response["result"]
68
 
69
+ return answer
 
 
 
 
 
70
 
 
71
 
72
+ # Streamlit interface
73
+ st.title("🐋 DeepSeek-R1 - Document RAG")
 
74
 
75
+ # File uploader widget
76
+ uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
77
 
78
+ if uploaded_file is not None:
79
+ # Define save path and save the uploaded file
80
+ save_path = os.path.join(working_dir, uploaded_file.name)
81
+ with open(save_path, "wb") as f:
82
  f.write(uploaded_file.getbuffer())
83
 
84
+ # Process the document
85
+ process_document_to_chroma_db(uploaded_file.name)
86
+ st.info("Document Processed Successfully")
 
 
 
 
 
 
 
87
 
88
+ # Text widget to get user input
89
+ user_question = st.text_area("Ask your question about the document")
90
 
91
+ if st.button("Answer"):
92
+ # Answer the user's question
93
+ answer = answer_question(user_question)
 
94
 
95
+ # Display the response
96
+ st.markdown("### DeepSeek-R1 Response")
97
+ st.markdown(answer)