Spaces:
No application file
No application file
Priyanka-Balivada
committed on
Commit
β’
6cc785b
1
Parent(s):
73d84aa
Upload 2 files
Browse files- docapp.py +72 -0
- requirements.txt +81 -0
docapp.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from langchain.text_splitter import CharacterTextSplitter
|
3 |
+
from langchain.docstore.document import Document
|
4 |
+
from langchain.chains.summarize import load_summarize_chain
|
5 |
+
from langchain_community.llms import CTransformers
|
6 |
+
from langchain.callbacks.manager import CallbackManager
|
7 |
+
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
8 |
+
from pypdf import PdfReader
|
9 |
+
|
10 |
+
# Page title
# Configure the browser tab title, then render the matching in-page heading.
# NOTE(review): the emoji in both title strings appear mojibake-mangled in
# this copy of the file — confirm the intended characters upstream.
st.set_page_config(page_title='π¦π Text Summarization App')
st.title('π¦π Text Summarization App')
|
13 |
+
|
14 |
+
# Function to read all PDF files and return text
|
15 |
+
# Function to read all PDF files and return text
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Parameters
    ----------
    pdf_docs : iterable
        File-like objects (e.g. Streamlit ``UploadedFile``) accepted by
        ``pypdf.PdfReader``.

    Returns
    -------
    str
        All page texts joined in order; empty string for no files/pages.
    """
    # Collect per-page strings and join once: repeated ``text += ...`` is
    # quadratic in the total text length.
    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # ``extract_text()`` can yield an empty result (and None in some
            # pypdf versions) for image-only pages — guard so join never
            # receives None.
            page_texts.append(page.extract_text() or "")
    return "".join(page_texts)
|
22 |
+
|
23 |
+
# Function to split the text into smaller chunks and convert it into document format
|
24 |
+
# Function to split the text into smaller chunks and convert it into document format
def chunks_and_document(txt):
    """Split *txt* into chunks and wrap each one as a LangChain ``Document``.

    Parameters
    ----------
    txt : str
        Raw text extracted from the uploaded PDFs.

    Returns
    -------
    list[Document]
        One ``Document`` per chunk, in original order.
    """
    splitter = CharacterTextSplitter()
    return [Document(page_content=chunk) for chunk in splitter.split_text(txt)]
|
29 |
+
|
30 |
+
# Loading the Llama 2's LLM
|
31 |
+
# Loading the Llama 2's LLM
def load_llm():
    """Instantiate the quantized Llama 2 7B chat model via CTransformers.

    Returns
    -------
    CTransformers
        The configured LLM, streaming generated tokens to stdout.
    """
    # We instantiate the callback with a streaming stdout handler
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

    # Loading the LLM model
    # BUG FIX: ``callback_manager`` was created but never attached to the
    # model, so token streaming never happened; attach it via ``callbacks``
    # (LangChain LLMs accept a BaseCallbackManager there).
    llm = CTransformers(
        model="llama-2-7b-chat.ggmlv3.q2_K.bin",
        model_type="llama",
        config={'max_new_tokens': 600,
                'temperature': 0.5,
                'context_length': 700},
        callbacks=callback_manager,
    )

    return llm
|
45 |
+
|
46 |
+
# Function to apply the LLM model with our document
|
47 |
+
# Function to apply the LLM model with our document
def chains_and_response(docs):
    """Summarize *docs* with a map-reduce chain over the local Llama 2 model.

    Parameters
    ----------
    docs : list[Document]
        Chunked documents produced by ``chunks_and_document``.

    Returns
    -------
    The result of ``Chain.invoke`` on the summarize chain.
    """
    summarize_chain = load_summarize_chain(load_llm(), chain_type='map_reduce')
    return summarize_chain.invoke(docs)
|
51 |
+
|
52 |
+
def main():
    """Streamlit entry point: upload PDFs, run summarization, show results."""
    # Initialize messages if not already present.
    # Idiomatic membership test — ``in st.session_state`` (no ``.keys()``).
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Sidebar for uploading PDF files
    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader(
            "Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True
        )
        if st.button("Submit & Process"):
            if not pdf_docs:
                # Don't run the pipeline on an empty upload list.
                st.warning("Please upload at least one PDF file first.")
            else:
                with st.spinner("Processing..."):
                    txt_input = get_pdf_text(pdf_docs)
                    docs = chunks_and_document(txt_input)
                    response = chains_and_response(docs)
                    # NOTE(review): the emoji in this title appears
                    # mojibake-mangled in this copy — confirm upstream.
                    st.title('πβ Summarization Result')
                    # NOTE(review): ``chain.invoke`` returns a mapping, so this
                    # loop iterates its keys; confirm whether the summary text
                    # itself (e.g. response["output_text"]) should be shown.
                    for res in response:
                        st.info(res)


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp==3.9.3
|
2 |
+
aiosignal==1.3.1
|
3 |
+
altair==5.2.0
|
4 |
+
annotated-types==0.6.0
|
5 |
+
anyio==4.2.0
|
6 |
+
async-timeout==4.0.3
|
7 |
+
attrs==23.2.0
|
8 |
+
blinker==1.7.0
|
9 |
+
cachetools==5.3.2
|
10 |
+
certifi==2024.2.2
|
11 |
+
charset-normalizer==3.3.2
|
12 |
+
click==8.1.7
|
13 |
+
colorama==0.4.6
|
14 |
+
ctransformers==0.2.27
|
15 |
+
dataclasses-json==0.6.4
|
16 |
+
exceptiongroup==1.2.0
|
17 |
+
filelock==3.13.1
|
18 |
+
frozenlist==1.4.1
|
19 |
+
fsspec==2024.2.0
|
20 |
+
gitdb==4.0.11
|
21 |
+
GitPython==3.1.41
|
22 |
+
greenlet==3.0.3
|
23 |
+
huggingface-hub==0.20.3
|
24 |
+
idna==3.6
|
25 |
+
importlib-metadata==7.0.1
|
26 |
+
Jinja2==3.1.3
|
27 |
+
jsonpatch==1.33
|
28 |
+
jsonpointer==2.4
|
29 |
+
jsonschema==4.21.1
|
30 |
+
jsonschema-specifications==2023.12.1
|
31 |
+
langchain==0.1.6
|
32 |
+
langchain-community==0.0.19
|
33 |
+
langchain-core==0.1.22
|
34 |
+
langsmith==0.0.87
|
35 |
+
markdown-it-py==3.0.0
|
36 |
+
MarkupSafe==2.1.5
|
37 |
+
marshmallow==3.20.2
|
38 |
+
mdurl==0.1.2
|
39 |
+
multidict==6.0.5
|
40 |
+
mypy-extensions==1.0.0
|
41 |
+
numpy==1.26.4
|
42 |
+
packaging==23.2
|
43 |
+
pandas==2.2.0
|
44 |
+
pillow==10.2.0
|
45 |
+
protobuf==4.25.2
|
46 |
+
py-cpuinfo==9.0.0
|
47 |
+
pyarrow==15.0.0
|
48 |
+
pydantic==2.6.1
|
49 |
+
pydantic_core==2.16.2
|
50 |
+
pydeck==0.8.1b0
|
51 |
+
Pygments==2.17.2
|
52 |
+
python-dateutil==2.8.2
|
53 |
+
pytz==2024.1
|
54 |
+
PyYAML==6.0.1
|
55 |
+
referencing==0.33.0
|
56 |
+
regex==2023.12.25
|
57 |
+
requests==2.31.0
|
58 |
+
rich==13.7.0
|
59 |
+
rpds-py==0.17.1
|
60 |
+
safetensors==0.4.2
|
61 |
+
six==1.16.0
|
62 |
+
smmap==5.0.1
|
63 |
+
sniffio==1.3.0
|
64 |
+
SQLAlchemy==2.0.25
|
65 |
+
streamlit==1.31.0
|
66 |
+
tenacity==8.2.3
|
67 |
+
tokenizers==0.15.1
|
68 |
+
toml==0.10.2
|
69 |
+
toolz==0.12.1
|
70 |
+
tornado==6.4
|
71 |
+
tqdm==4.66.2
|
72 |
+
transformers==4.37.2
|
73 |
+
typing-inspect==0.9.0
|
74 |
+
typing_extensions==4.9.0
|
75 |
+
tzdata==2023.4
|
76 |
+
tzlocal==5.2
|
77 |
+
urllib3==2.2.0
|
78 |
+
validators==0.22.0
|
79 |
+
watchdog==4.0.0
|
80 |
+
yarl==1.9.4
|
81 |
+
zipp==3.17.0
|