Spaces:
Build error
Build error
abishekcodes
commited on
Upload 7 files
Browse files- .gitattributes +2 -35
- .gitignore +152 -0
- LICENSE +21 -0
- README +56 -0
- app.py +85 -0
- htmltemp.py +44 -0
- requirements.txt +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,2 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
# Auto detect text files and perform LF normalization
|
2 |
+
* text=auto
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
105 |
+
__pypackages__/
|
106 |
+
|
107 |
+
# Celery stuff
|
108 |
+
celerybeat-schedule
|
109 |
+
celerybeat.pid
|
110 |
+
|
111 |
+
# SageMath parsed files
|
112 |
+
*.sage.py
|
113 |
+
|
114 |
+
# Environments
|
115 |
+
.env
|
116 |
+
.venv
|
117 |
+
env/
|
118 |
+
venv/
|
119 |
+
ENV/
|
120 |
+
env.bak/
|
121 |
+
venv.bak/
|
122 |
+
|
123 |
+
# Spyder project settings
|
124 |
+
.spyderproject
|
125 |
+
.spyproject
|
126 |
+
|
127 |
+
# Rope project settings
|
128 |
+
.ropeproject
|
129 |
+
|
130 |
+
# mkdocs documentation
|
131 |
+
/site
|
132 |
+
|
133 |
+
# mypy
|
134 |
+
.mypy_cache/
|
135 |
+
.dmypy.json
|
136 |
+
dmypy.json
|
137 |
+
|
138 |
+
# Pyre type checker
|
139 |
+
.pyre/
|
140 |
+
|
141 |
+
# pytype static type analyzer
|
142 |
+
.pytype/
|
143 |
+
|
144 |
+
# Cython debug symbols
|
145 |
+
cython_debug/
|
146 |
+
|
147 |
+
# PyCharm
|
148 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
149 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
150 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
151 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
152 |
+
#.idea/
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Abishek M
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Langchain Based LLM Project
|
2 |
+
|
3 |
+
## Overview
|
4 |
+
|
5 |
+
This is a simple Langchain-based LLM project that allows users to upload multiple documents and ask questions through a web UI. The LLM has access to the knowledge contained within the uploaded documents and can provide answers to user's questions. This project can be improved in the future by creating a more user-friendly interface and enhancing its academic capabilities.
|
6 |
+
|
7 |
+
## Features
|
8 |
+
|
9 |
+
The following features are currently available in this project:
|
10 |
+
|
11 |
+
- Users can upload multiple PDF documents through the web UI.
|
12 |
+
- Users can ask questions related to the content of the uploaded documents.
|
13 |
+
- The LLM uses natural language processing and machine learning algorithms to understand and interpret user questions.
|
14 |
+
- The LLM provides answers to user questions based on the information contained within the uploaded documents.
|
15 |
+
- The LLM retains knowledge of past messages and documents to provide more accurate responses.
|
16 |
+
|
17 |
+
## Installation
|
18 |
+
|
19 |
+
To install this project, follow these steps:
|
20 |
+
|
21 |
+
1. Clone the repository using
|
22 |
+
|
23 |
+
`git clone https://github.com/abishek-ctrl/pdf-chat-langchain.git`
|
24 |
+
2. Install all dependencies by running
|
25 |
+
|
26 |
+
`pip install -r requirements.txt`
|
27 |
+
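3. Add your API keys by creating a .env file (`OPENAI_API_KEY` is used for the embeddings; the Mistral chat model additionally needs `MISTRAL_API_KEY`).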
|
28 |
+
|
29 |
+
OPENAI_API_KEY="your_openai_api_key"
|
30 |
+
4. Run the application using `streamlit run app.py`
|
31 |
+
|
32 |
+
# Usage
|
33 |
+
|
34 |
+
Once the project is installed, users can start using it by following these steps:
|
35 |
+
|
36 |
+
1. Upload one or more PDF documents using the "Upload" button on the web UI.
|
37 |
+
2. Ask a question related to the content of the uploaded documents using the "Ask Question" button on the web UI.
|
38 |
+
3. The LLM will process the question and provide an answer based on the information contained within the uploaded documents.
|
39 |
+
|
40 |
+
### Update (2024-01-20)
|
41 |
+
|
42 |
+
Added Streamlit URL for access: [https://lang-chat.streamlit.app](https://lang-chat.streamlit.app)
|
43 |
+
|
44 |
+
# Future Improvements
|
45 |
+
|
46 |
+
We plan to improve this project in the future by implementing the following features:
|
47 |
+
|
48 |
+
- A more user-friendly interface to make it easier for users to interact with the LLM.
|
49 |
+
- Deployment to the real world for public use.
|
50 |
+
- Enhanced academic capabilities to support students and researchers in their studies and work.
|
51 |
+
- Integration with additional document formats to expand the types of documents that can be used with the LLM.
|
52 |
+
- Advanced language models to improve the accuracy and comprehensiveness of the LLM's responses.
|
53 |
+
|
54 |
+
# License
|
55 |
+
|
56 |
+
This project is licensed under the MIT License.
|
app.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from PyPDF2 import PdfReader
|
5 |
+
from langchain_mistralai.chat_models import ChatMistralAI
|
6 |
+
from langchain.text_splitter import CharacterTextSplitter
|
7 |
+
from langchain.vectorstores import FAISS
|
8 |
+
from langchain.embeddings import OpenAIEmbeddings
|
9 |
+
from langchain.memory import ConversationBufferMemory
|
10 |
+
from langchain.chains import ConversationalRetrievalChain
|
11 |
+
from htmltemp import css, botTemp, userTemp
|
12 |
+
|
13 |
+
def get_text(docs):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        docs: Iterable of PDF sources that ``PdfReader`` can open (in this
            app, the files returned by the Streamlit uploader).

    Returns:
        One string containing the text of all pages, in document order and
        then page order. Returns ``""`` when ``docs`` is empty.
    """
    page_texts = []
    for pdf in docs:
        for page in PdfReader(pdf).pages:
            # A page whose extraction yields nothing usable (empty string or
            # None) contributes "" instead of breaking the concatenation.
            page_texts.append(page.extract_text() or "")
    # Join once at the end rather than rebuilding the string for every page.
    return "".join(page_texts)
|
20 |
+
|
21 |
+
def get_chunks(rawtxt):
    """Split raw document text into overlapping chunks.

    Uses a ``CharacterTextSplitter`` configured to split on newlines with
    ``chunk_size=1000`` and ``chunk_overlap=200``, measuring length with
    ``len``.

    Args:
        rawtxt: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``rawtxt``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(rawtxt)
|
30 |
+
|
31 |
+
def get_vectorstore(chunks):
    """Embed the text chunks with OpenAI and index them in a FAISS store.

    The API key is read from the ``OPENAI_API_KEY`` environment variable
    (``None`` is passed through if it is unset).

    Args:
        chunks: Text chunks to embed.

    Returns:
        The FAISS vector store built from ``chunks``.
    """
    embedder = OpenAIEmbeddings(openai_api_key=os.environ.get("OPENAI_API_KEY"))
    return FAISS.from_texts(texts=chunks, embedding=embedder)
|
36 |
+
|
37 |
+
def get_convo_chain(vector):
    """Build the conversational retrieval chain over a vector store.

    Args:
        vector: Vector store exposing ``as_retriever()``.

    Returns:
        A ``ConversationalRetrievalChain`` that answers with the Mistral chat
        model and keeps its history under the ``chat_history`` memory key.
    """
    # The chat model is Mistral (ChatMistralAI), not ChatOpenAI.
    chat_model = ChatMistralAI()
    history = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    chain_options = {
        "llm": chat_model,
        "retriever": vector.as_retriever(),
        "memory": history,
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)
|
46 |
+
|
47 |
+
def handle_inp(inp):
    """Send a question to the conversation chain and render the chat history.

    Args:
        inp: The question typed by the user.

    Side effects:
        Stores the chain's ``chat_history`` in ``st.session_state`` and
        writes one HTML snippet per message to the page. If no conversation
        chain exists yet, shows a warning and returns without querying.
    """
    # Function-scope import keeps this block self-contained.
    import html

    chain = st.session_state.conversation
    if chain is None:
        # The chain is only created after PDFs are processed; calling None
        # would raise a TypeError.
        st.warning("Please upload and process your PDFs before asking a question.")
        return

    response = chain({'question': inp})
    st.session_state.chat_history = response['chat_history']

    for i, msg in enumerate(st.session_state.chat_history):
        # Even positions are rendered as user turns, odd ones as bot turns.
        template = userTemp if i % 2 == 0 else botTemp
        # Message text is rendered with unsafe_allow_html=True, so it is
        # escaped first to keep markup in it from being interpreted.
        safe_content = html.escape(msg.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
|
56 |
+
|
57 |
+
|
58 |
+
def main():
    """Run the Streamlit PDF chat page.

    Loads environment variables, configures the page, initialises the
    ``conversation`` and ``chat_history`` session entries, handles a typed
    question, and builds a new conversation chain from the uploaded PDFs
    when the sidebar "Upload" button is pressed.
    """
    load_dotenv()
    st.set_page_config(page_title="PDF Chat", page_icon=":books:")

    st.write(css, unsafe_allow_html=True)

    # Initialise session entries on first run only.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("PDF Chat :books:")
    inp = st.text_input("Ask a question about the PDF:")
    if inp:
        handle_inp(inp)

    with st.sidebar:
        st.subheader("Your PDFs")
        docs = st.file_uploader("Upload your PDFs here", accept_multiple_files=True)
        if st.button("Upload"):
            if not docs:
                # Nothing selected: skip processing instead of trying to
                # index an empty document set.
                st.warning("Please select at least one PDF before uploading.")
            else:
                with st.spinner("Processing"):
                    rawtxt = get_text(docs)
                    chunks = get_chunks(rawtxt)
                    if chunks:
                        vectors = get_vectorstore(chunks)
                        st.session_state.conversation = get_convo_chain(vectors)
                    else:
                        # No chunks means there is nothing to embed; keep
                        # any existing conversation and tell the user.
                        st.error("No text could be extracted from the uploaded PDFs.")


if __name__ == "__main__":
    main()
|
htmltemp.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Stylesheet injected into the page for the chat bubbles.
# The <style> element is closed explicitly so the markup is well-formed.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
'''
|
27 |
+
|
28 |
+
# HTML snippet for a user turn; {{MSG}} is replaced with the message text.
# Uses the "user" CSS class so it matches the .chat-message.user rule.
userTemp = '''
<div class="chat-message user">
    <div class="avatar">
        <span>User:</span>
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

# HTML snippet for a bot turn; {{MSG}} is replaced with the message text.
# Uses the "bot" CSS class so it matches the .chat-message.bot rule.
botTemp = '''
<div class="chat-message bot">
    <div class="avatar">
        <span>Bot:</span>
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''
|
requirements.txt
ADDED
Binary file (3.48 kB). View file
|
|