abishekcodes committed on
Commit
6c14dd8
·
verified ·
1 Parent(s): fd0c545

Upload 7 files

Browse files
Files changed (7) hide show
  1. .gitattributes +2 -35
  2. .gitignore +152 -0
  3. LICENSE +21 -0
  4. README +56 -0
  5. app.py +85 -0
  6. htmltemp.py +44 -0
  7. requirements.txt +0 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105
+ __pypackages__/
106
+
107
+ # Celery stuff
108
+ celerybeat-schedule
109
+ celerybeat.pid
110
+
111
+ # SageMath parsed files
112
+ *.sage.py
113
+
114
+ # Environments
115
+ .env
116
+ .venv
117
+ env/
118
+ venv/
119
+ ENV/
120
+ env.bak/
121
+ venv.bak/
122
+
123
+ # Spyder project settings
124
+ .spyderproject
125
+ .spyproject
126
+
127
+ # Rope project settings
128
+ .ropeproject
129
+
130
+ # mkdocs documentation
131
+ /site
132
+
133
+ # mypy
134
+ .mypy_cache/
135
+ .dmypy.json
136
+ dmypy.json
137
+
138
+ # Pyre type checker
139
+ .pyre/
140
+
141
+ # pytype static type analyzer
142
+ .pytype/
143
+
144
+ # Cython debug symbols
145
+ cython_debug/
146
+
147
+ # PyCharm
148
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
149
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
150
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
151
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
152
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Abishek M
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Langchain Based LLM Project
2
+
3
+ ## Overview
4
+
5
+ This is a simple Langchain-based LLM project that allows users to upload multiple documents and ask questions through a web UI. The LLM has access to the knowledge contained within the uploaded documents and can provide answers to user's questions. This project can be improved in the future by creating a more user-friendly interface and enhancing its academic capabilities.
6
+
7
+ ## Features
8
+
9
+ The following features are currently available in this project:
10
+
11
+ - Users can upload multiple PDF documents through the web UI.
12
+ - Users can ask questions related to the content of the uploaded documents.
13
+ - The LLM uses natural language processing and machine learning algorithms to understand and interpret user questions.
14
+ - The LLM provides answers to user questions based on the information contained within the uploaded documents.
15
+ - The LLM retains knowledge of past messages and documents to provide more accurate responses.
16
+
17
+ ## Installation
18
+
19
+ To install this project, follow these steps:
20
+
21
+ 1. Clone the repository using
22
+
23
+ `git clone https://github.com/abishek-ctrl/pdf-chat-langchain.git`
24
+ 2. Install all dependencies by running
25
+
26
+ `pip install -r requirements.txt`
27
+ 3. Add your OpenAI API key by creating a .env file.
28
+
29
+ OPENAI_API_KEY="your_openai_api_key"
30
+ 4. Run the application using `streamlit run app.py`
31
+
32
+ # Usage
33
+
34
+ Once the project is installed, users can start using it by following these steps:
35
+
36
+ 1. Upload one or more PDF documents using the "Upload" button on the web UI.
37
+ 2. Ask a question related to the content of the uploaded documents using the "Ask Question" button on the web UI.
38
+ 3. The LLM will process the question and provide an answer based on the information contained within the uploaded documents.
39
+
40
+ ### Update (2024-01-20)
41
+
42
+ Added Streamlit URL for access: [https://lang-chat.streamlit.app](https://lang-chat.streamlit.app)
43
+
44
+ # Future Improvements
45
+
46
+ We plan to improve this project in the future by implementing the following features:
47
+
48
+ - A more user-friendly interface to make it easier for users to interact with the LLM.
49
+ - Deployment to the real world for public use.
50
+ - Enhanced academic capabilities to support students and researchers in their studies and work.
51
+ - Integration with additional document formats to expand the types of documents that can be used with the LLM.
52
+ - Advanced language models to improve the accuracy and comprehensiveness of the LLM's responses.
53
+
54
+ # License
55
+
56
+ This project is licensed under the MIT License.
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from PyPDF2 import PdfReader
5
+ from langchain_mistralai.chat_models import ChatMistralAI
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.embeddings import OpenAIEmbeddings
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ from htmltemp import css, botTemp, userTemp
12
+
13
def get_text(docs):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        docs: Iterable of PDF sources accepted by ``PdfReader`` (in this app,
            the value returned by ``st.file_uploader``). ``None`` or an empty
            iterable is treated as "no documents".

    Returns:
        One string holding the text of all pages in document/page order, or
        ``""`` when there are no documents or no extractable text.
    """
    page_texts = []
    for pdf in docs or ():
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Guard against a falsy result (e.g. a page with no extractable
            # text) so the concatenation never fails on a non-string.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with repeated "+=".
    return "".join(page_texts)
20
+
21
def get_chunks(rawtxt):
    """Split raw text into overlapping chunks.

    The text is split on newlines with ``CharacterTextSplitter``, using a
    chunk size of 1000 and an overlap of 200, both measured with ``len``.

    Args:
        rawtxt: The full text to split.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(rawtxt)
30
+
31
def get_vectorstore(chunks):
    """Build a FAISS vector store from text chunks using OpenAI embeddings.

    The OpenAI key is read from the ``OPENAI_API_KEY`` environment variable
    (``None`` is passed through when it is unset).

    Args:
        chunks: Texts to embed and index.

    Returns:
        The store created by ``FAISS.from_texts``.
    """
    embedder = OpenAIEmbeddings(openai_api_key=os.environ.get('OPENAI_API_KEY'))
    return FAISS.from_texts(texts=chunks, embedding=embedder)
36
+
37
def get_convo_chain(vector):
    """Create a conversational retrieval chain backed by the given store.

    The chain uses a default-configured ``ChatMistralAI`` model (rather than
    ChatOpenAI), a ``ConversationBufferMemory`` stored under the
    ``chat_history`` key, and ``vector.as_retriever()`` for retrieval.

    Args:
        vector: Object exposing ``as_retriever()`` (the FAISS store in this app).

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``.
    """
    chain_kwargs = {
        "llm": ChatMistralAI(),
        "memory": ConversationBufferMemory(memory_key='chat_history', return_messages=True),
        "retriever": vector.as_retriever(),
    }
    return ConversationalRetrievalChain.from_llm(**chain_kwargs)
46
+
47
def handle_inp(inp):
    """Send a question to the active conversation chain and render the chat.

    The chain stored in ``st.session_state.conversation`` is called with the
    question, the returned ``chat_history`` is saved back into the session
    state, and every message is rendered: even positions with ``userTemp``,
    odd positions with ``botTemp``.

    Args:
        inp: The question text entered by the user.

    Returns:
        None. When no chain exists yet (no PDFs processed), a warning is
        shown and nothing else happens.
    """
    # Local import keeps this block self-contained; used to neutralise markup
    # in message text before it is injected into raw HTML.
    import html

    conversation = st.session_state.conversation
    if conversation is None:
        # main() initialises the chain to None until PDFs are processed;
        # calling None would raise TypeError.
        st.warning("Please upload and process your PDFs before asking a question.")
        return

    response = conversation({'question': inp})
    st.session_state.chat_history = response['chat_history']

    for i, msg in enumerate(st.session_state.chat_history):
        template = userTemp if i % 2 == 0 else botTemp
        # The templates are rendered with unsafe_allow_html=True, so escape
        # the message content to prevent HTML/script injection.
        safe_content = html.escape(msg.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
56
+
57
+
58
def main():
    """Run the Streamlit PDF chat page.

    Loads environment variables, configures the page, injects the CSS,
    initialises the ``conversation`` and ``chat_history`` session entries,
    handles a submitted question, and processes PDFs uploaded in the sidebar
    into a new conversation chain.
    """
    load_dotenv()
    st.set_page_config(page_title="PDF Chat", page_icon=":books:")

    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("PDF Chat :books:")
    inp = st.text_input("Ask a question about the PDF:")
    if inp:
        handle_inp(inp)

    with st.sidebar:
        st.subheader("Your PDFs")
        # Restrict the picker to PDFs, since every file is parsed as a PDF.
        docs = st.file_uploader(
            "Upload your PDFs here",
            type="pdf",
            accept_multiple_files=True,
        )
        if st.button("Upload"):
            if not docs:
                # Nothing selected: building an index from no text would fail.
                st.warning("Please select at least one PDF before clicking Upload.")
            else:
                with st.spinner("Processing"):
                    rawtxt = get_text(docs)
                    chunks = get_chunks(rawtxt)
                    if not chunks:
                        # e.g. PDFs without extractable text: there is
                        # nothing to embed, so keep the previous state.
                        st.error("No extractable text was found in the uploaded PDFs.")
                    else:
                        vectors = get_vectorstore(chunks)
                        st.session_state.conversation = get_convo_chain(vectors)


if __name__ == "__main__":
    main()
htmltemp.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stylesheet for the chat bubbles; rendered by the app as raw HTML.
# The <style> element must be closed, otherwise the markup that follows it
# on the page can be swallowed into the stylesheet.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
'''
27
+
28
# HTML template for a user message; "{{MSG}}" is replaced with the message text.
# Uses the "user" CSS class so it picks up the .chat-message.user style.
userTemp = '''
<div class="chat-message user">
    <div class="avatar">
        <span>User:</span>
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

# HTML template for a bot message; "{{MSG}}" is replaced with the message text.
# Uses the "bot" CSS class so it picks up the .chat-message.bot style.
botTemp = '''
<div class="chat-message bot">
    <div class="avatar">
        <span>Bot:</span>
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''
requirements.txt ADDED
Binary file (3.48 kB). View file