Alanturner2 committed on
Commit
69cbfa2
·
verified ·
1 Parent(s): 2c8ac87

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +229 -96
app.py CHANGED
@@ -1,113 +1,246 @@
1
- import os
2
- from langchain.document_loaders import PyPDFLoader
3
- from langchain import PromptTemplate
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
 
5
  from langchain.chains.summarize import load_summarize_chain
6
- from langchain.chat_models import ChatOpenAI
 
7
 
 
 
8
 
9
def setup_documents(pdf_file_path, chunk_size, chunk_overlap):
    """Load a PDF and split it into chunked documents.

    Args:
        pdf_file_path: Path handed to ``PyPDFLoader``.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            same splitter.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    pages = PyPDFLoader(pdf_file_path).load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    # Split the loaded documents directly instead of re-creating documents
    # from their raw text, so each chunk keeps the metadata of its page.
    return text_splitter.split_documents(pages)
18
-
 
 
 
 
 
 
 
 
19
 
 
 
 
 
 
 
20
 
21
def custom_summary(docs, llm, custom_prompt, chain_type, num_summaries):
    """Run a summarization chain over ``docs`` ``num_summaries`` times.

    Args:
        docs: Documents passed to the chain as ``input_documents``.
        llm: Language model handed to ``load_summarize_chain``.
        custom_prompt: User instruction; ``":\\n {text}"`` is appended to it
            to form the combine prompt. It is only used when ``chain_type``
            is ``"map_reduce"``; other chain types run with their defaults.
        chain_type: Chain type name forwarded to ``load_summarize_chain``.
        num_summaries: How many independent summaries to generate.

    Returns:
        A list with one ``output_text`` string per run.
    """
    custom_prompt = custom_prompt + """:\n {text}"""
    COMBINE_PROMPT = PromptTemplate(template=custom_prompt, input_variables=["text"])
    MAP_PROMPT = PromptTemplate(template="Summarize:\n{text}", input_variables=["text"])
    if chain_type == "map_reduce":
        chain = load_summarize_chain(llm, chain_type=chain_type,
                                     map_prompt=MAP_PROMPT,
                                     combine_prompt=COMBINE_PROMPT)
    else:
        chain = load_summarize_chain(llm, chain_type=chain_type)

    # Each run is an independent call; the loop index itself is not needed.
    return [
        chain({"input_documents": docs}, return_only_outputs=True)["output_text"]
        for _ in range(num_summaries)
    ]
38
 
39
 
40
def color_chunks(text: str, chunk_size: int, overlap_size: int) -> str:
    """Render ``text`` as HTML ``<mark>`` spans, one coloured span per chunk.

    The text is walked in windows of ``chunk_size`` characters that start
    every ``chunk_size - overlap_size`` characters. For every window after
    the first, the part shared with the previous window is emitted in grey
    and the remainder in the next colour of a cycling palette.

    Args:
        text: Text to visualise; it is HTML-escaped before being embedded.
        chunk_size: Number of characters per chunk.
        overlap_size: Number of characters shared by consecutive chunks.

    Returns:
        The concatenated HTML markup (an empty string for empty ``text``).

    Raises:
        ValueError: If ``overlap_size`` is not smaller than ``chunk_size``,
            because the window could then never advance.
    """
    import html

    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError("overlap_size must be smaller than chunk_size")

    overlap_color = "#808080"
    # Palette cycled through for successive chunks.
    chunk_colors = ["#a8d08d", "#c6dbef", "#e6550d", "#fd8d3c", "#fdae6b", "#fdd0a2"]

    pieces = []
    overlap = ""
    for chunk_number, start in enumerate(range(0, len(text), step)):
        chunk = text[start:start + chunk_size]
        if overlap:
            pieces.append(
                f'<mark style="background-color: {overlap_color};">{html.escape(overlap)}</mark>'
            )
            chunk = chunk[len(overlap):]
        color = chunk_colors[chunk_number % len(chunk_colors)]
        # Escape so characters such as "<" in the input cannot inject markup;
        # the caller renders this string as raw HTML.
        pieces.append(f'<mark style="background-color: {color};">{html.escape(chunk)}</mark>')
        overlap = text[start + chunk_size - overlap_size:start + chunk_size]

    return "".join(pieces)
57
-
58
-
59
def main():
    """Streamlit entry point for the custom summarization app.

    The sidebar selects the LLM, chain type, chunk size and chunk overlap.
    With "Debug chunk size" ticked, the page only visualises how a sample
    text would be chunked. Otherwise it asks for a prompt and a PDF path,
    loads the PDF, and summarises it when "Summarize" is pressed.
    """
    st.set_page_config(layout="wide")
    st.title("Custom Summarization App")
    llm_choice = st.sidebar.selectbox("LLM", ["ChatGPT", "GPT4", "Other (open source in the future)"])
    chain_type = st.sidebar.selectbox("Chain Type", ["map_reduce", "stuff", "refine"])
    chunk_size = st.sidebar.slider("Chunk Size", min_value=20, max_value=10000,
                                   step=10, value=2000)
    chunk_overlap = st.sidebar.slider("Chunk Overlap", min_value=5, max_value=5000,
                                      step=10, value=200)

    if st.sidebar.checkbox("Debug chunk size"):
        st.header("Interactive Text Chunk Visualization")
        text_input = st.text_area("Input Text", "This is a test text to showcase the functionality of the interactive text chunk visualizer.")
        html_code = color_chunks(text_input, chunk_size, chunk_overlap)
        st.markdown(html_code, unsafe_allow_html=True)
        return

    user_prompt = st.text_input("Enter the custom summary prompt")
    pdf_file_path = st.text_input("Enter the pdf file path")

    temperature = st.sidebar.number_input("Set the ChatGPT Temperature",
                                          min_value=0.0,
                                          max_value=1.0,
                                          step=0.1,
                                          value=0.5)
    num_summaries = st.sidebar.number_input("Number of summaries",
                                            min_value=1,
                                            max_value=10,
                                            step=1,
                                            value=1)

    # Nothing below can work without a document, so stop here instead of
    # reaching the Summarize button with `docs` undefined.
    if pdf_file_path == "":
        return

    docs = setup_documents(pdf_file_path, chunk_size, chunk_overlap)
    st.write("PDF loaded successfully")

    if llm_choice == "GPT4":
        llm = ChatOpenAI(model_name="gpt-4", temperature=temperature)
    else:
        if llm_choice != "ChatGPT":
            st.write("Using ChatGPT while open source models are not implemented!")
        llm = ChatOpenAI(temperature=temperature)

    if st.button("Summarize"):
        result = custom_summary(docs, llm, user_prompt, chain_type, num_summaries)
        st.write("Summary:")
        for summary in result:
            st.write(summary)


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.document_loaders import ArxivLoader
3
+ from PyPDF2 import PdfReader
4
+ from langchain_community.llms import HuggingFaceHub
5
+ from langchain.text_splitter import TokenTextSplitter
6
  from langchain.chains.summarize import load_summarize_chain
7
+ from langchain.document_loaders import PyPDFLoader
8
+ from transformers import pipeline
9
 
10
+ from dotenv import load_dotenv
11
+ import os
12
 
13
# Load variables from a local .env file into the process environment.
load_dotenv()
hugging_api_key = os.getenv('HUGGING_API_KEY')

from groq import AsyncGroq
from groq import Groq

from langchain_groq import ChatGroq
from langchain.document_loaders import ArxivLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from huggingface_hub import login

login(hugging_api_key)
embedding_model = HuggingFaceHubEmbeddings(huggingfacehub_api_token=hugging_api_key)
# The Groq key is read from the GROQ_API_KEY environment variable instead of
# being written into the source. A key that has been committed to a
# repository must be treated as leaked and revoked.
llm = ChatGroq(temperature=0, model_name="llama3-70b-8192", api_key=os.getenv('GROQ_API_KEY'))
28
+
29
def display_results(result):
    """Join the entries of ``result`` into one string, one entry per line.

    Entries are separated by a single newline; an entry that already ends
    with a newline therefore shows up followed by a blank line.

    Args:
        result: Iterable of strings.

    Returns:
        The joined string (empty when ``result`` is empty).
    """
    return "\n".join(result)
31
 
32
def summarize_pdf(pdf_file_path, max_length):
    """Summarize a local PDF with the ``allenai/led-large-16384-arxiv`` model.

    The text of every page is extracted and concatenated, split into token
    chunks, and each chunk is summarized separately; the chunk summaries are
    concatenated in order.

    Args:
        pdf_file_path: Path or file object accepted by ``PdfReader``.
        max_length: Maximum summary length handed to the pipeline.

    Returns:
        The concatenated summaries of all chunks.
    """
    # NOTE(review): device=0 selects the first GPU — confirm the deployment
    # target always has one.
    summarizer = pipeline('summarization', model='allenai/led-large-16384-arxiv',
                          min_length=100, max_length=max_length, device=0)
    reader = PdfReader(pdf_file_path)
    # extract_text() can yield nothing for a page without a text layer.
    text = "".join(page.extract_text() or "" for page in reader.pages)

    text_splitter = TokenTextSplitter(chunk_size=8192, chunk_overlap=1000)
    # split_text() returns plain strings, so each chunk is passed to the
    # summarizer directly (there is no .page_content attribute on a str).
    chunks = text_splitter.split_text(text)
    return "".join(summarizer(chunk)[0]['summary_text'] for chunk in chunks)
47
+
48
def summarize_text(text):
    """Summarize ``text`` with the ``llama3-70b-8192`` model on Groq.

    Args:
        text: The passage to summarize; it is embedded in the user message.

    Returns:
        The content of the first choice of the chat completion.
    """
    # The key comes from the GROQ_API_KEY environment variable; credentials
    # must not be written into the source.
    sum_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    messages = [
        {
            "role": "system",
            "content": "You are summarizer. If I give you the whole text you should summarize it. And you don't need the title and author",
        },
        {
            "role": "user",
            "content": f"Summarize the paper. The whole text is {text}",
        },
    ]
    response = sum_client.chat.completions.create(
        messages=messages,
        model="llama3-70b-8192",
        temperature=0,
        max_tokens=8192,
        top_p=1,
        stop=None
    )
    return response.choices[0].message.content
69
+
70
+
71
+
72
+
73
def remove_first_sentence_and_title(text):
    """Drop the leading sentence and any ``**Title:**`` line from ``text``.

    The first sentence is everything up to and including the first ``". "``.
    If the text contains no such separator it is left intact. Then the first
    ``**Title:**`` marker and the rest of its line are removed; when the
    marker's line is the last one, everything from the marker on is removed.

    Args:
        text: Text to clean up.

    Returns:
        The remaining text with surrounding whitespace stripped.
    """
    _first_sentence, separator, remainder = text.partition('. ')
    # Without a sentence separator there is no first sentence to remove.
    body = remainder if separator else text

    title_start = body.find('**Title:**')
    if title_start != -1:
        # Keep what precedes the marker plus whatever follows the end of the
        # marker's line (nothing, if that line is the last one).
        _title_line, _newline, after_title = body[title_start:].partition('\n')
        body = body[:title_start] + after_title

    return body.strip()
90
+
91
+
92
+
93
def summarize_arxiv_pdf(query):
    """Build a Markdown report for the arXiv papers matching ``query``.

    The full text of the loaded papers is split into token chunks, each chunk
    is summarized with ``summarize_text`` and the results are concatenated
    into one generated summary. For every paper returned by the loader's
    summary listing, the report contains title, authors, link, the arXiv
    abstract and that generated summary.

    Args:
        query: arXiv query (for example a paper number) given to ``ArxivLoader``.

    Returns:
        The report as a single string, or a short notice when nothing was found.
    """
    loader = ArxivLoader(query=query, load_max_docs=10)
    documents = loader.load()
    if not documents:
        # Nothing to summarize; avoid indexing into an empty result list.
        return "No arXiv paper found for this query."

    text_splitter = TokenTextSplitter(chunk_size=5700, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)

    ref_summary = "".join(summarize_text(chunk.page_content) for chunk in chunks)
    ref_summary = ref_summary.replace("Here is a summary of the paper:", "").strip()

    summaries = []
    for doc in loader.get_summaries_as_docs():
        title = doc.metadata.get("Title")
        authors = doc.metadata.get("Authors")
        url = doc.metadata.get("Entry ID")
        summary = doc.page_content
        summaries.append(f"**{title}**\n")
        summaries.append(f"**Authors:** {authors}\n")
        summaries.append(f"**View full paper:** [Link to paper]({url})\n")
        summaries.append(f"**Summary:** {summary}\n")
        summaries.append(f"**Lazyman Summary:**\n ")
        summaries.append(f"{ref_summary}")
    return display_results(summaries)
126
 
127
 
128
# Async Groq client used by the chat handlers. The key is read from the
# GROQ_API_KEY environment variable rather than stored in the source.
client = AsyncGroq(api_key=os.getenv("GROQ_API_KEY"))
129
+
130
async def chat_with_replit(message, history):
    """Stream an assistant reply for the general chat tab.

    Args:
        message: The new user message.
        history: Previous turns; each item is indexed as ``[0]`` for the user
            text and ``[1]`` for the assistant text.

    Yields:
        The reply accumulated so far, once per streamed piece of content.
    """
    # The system prompt is sent exactly once, also when there is no history.
    messages = [
        {"role": "system", "content": "You are assistor. I will ask you some questions than you should answer!"}
    ]
    for chat in history:
        messages.append({"role": 'user', "content": str(chat[0])})
        messages.append({"role": 'assistant', "content": str(chat[1])})
    messages.append({"role": "user", "content": str(message)})

    response_content = ""
    stream = await client.chat.completions.create(
        messages=messages,
        model="llama3-70b-8192",
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=True,
    )
    async for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            response_content += content
            yield response_content
165
+
166
+ js = """<script src="https://replit.com/public/js/replit-badge-v2.js" theme="dark" position="bottom-right"></script>"""
167
+
168
+
169
# Index of the most recently requested paper, kept between calls so that
# follow-up questions about the same paper do not reload and re-embed it.
_pdf_chat_cache = {"doi": None, "metadata": None, "vector_store": None}


async def chat_with_replit_pdf(message, history, doi_num):
    """Answer a question about one arXiv paper using retrieved passages.

    The paper is loaded and indexed in a Chroma vector store the first time
    its number is seen; the three passages most similar to ``message`` are
    then sent to the model together with the paper's metadata.

    Args:
        message: The user's question.
        history: Chat history supplied by the UI; it is not sent to the model.
        doi_num: arXiv number of the paper, used as the loader query.

    Returns:
        The model's answer, or a short notice when no paper could be loaded.
    """
    doi = str(doi_num).strip()
    if not doi:
        return "Please enter an arXiv number first."

    if _pdf_chat_cache["doi"] != doi:
        loader = ArxivLoader(query=doi, load_max_docs=10)
        documents = loader.load_and_split()
        if not documents:
            return f"No arXiv paper found for {doi_num}."
        # NOTE(review): Chroma.from_documents is called without a collection
        # name — confirm successive papers do not end up sharing a collection.
        _pdf_chat_cache.update(
            doi=doi,
            metadata=documents[0].metadata,
            vector_store=Chroma.from_documents(documents, embedding_model),
        )
    metadata = _pdf_chat_cache["metadata"]
    vector_store = _pdf_chat_cache["vector_store"]

    results = vector_store.similarity_search(message, k=3)
    relevant_content = "\n\n".join(doc.page_content for doc in results)

    # Instructions and context go first, then the question they apply to.
    messages = [
        {
            "role": "system",
            "content": f"You should answer about this arxiv paper for {doi_num}.\n"
                       f"This is the metadata of the paper:{metadata}.\n"
                       f"This is relevant information of the paper:{relevant_content}.\n"
        },
        {
            "role": "user",
            "content": str(message),
        },
    ]

    response = await client.chat.completions.create(
        messages=messages,
        model="llama3-70b-8192",
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )
    return response.choices[0].message.content
212
+
213
 
214
# Gradio UI with four tabs: arXiv summarization, local PDF summarization,
# a general chatbot and a chat about one arXiv paper.
# NOTE(review): the container nesting below is reconstructed from the order
# of the widgets — confirm it against the intended layout.
with gr.Blocks() as app:
    with gr.Tab(label="Arxiv summarization"):
        with gr.Column():
            number = gr.Textbox(label="Enter your arxiv number")
            sumarxiv_btn = gr.Button(value="summarize-arxiv")
        with gr.Column():
            outputs = gr.Markdown(label="Summary")
        sumarxiv_btn.click(summarize_arxiv_pdf, inputs=number, outputs=outputs)
    with gr.Tab(label="Local summarization"):
        with gr.Row():
            with gr.Column():
                input_path = gr.File(label="Upload PDF file")
            with gr.Column():
                set_max_length = gr.Slider(512, 4096, value=2048, step=512, label="max length")
                sumlocal_btn = gr.Button(value="summarize-local")
        with gr.Row():
            output_local = gr.Textbox(label="summary")
        sumlocal_btn.click(summarize_pdf, inputs=[input_path, set_max_length], outputs=output_local)
    with gr.Tab(label="ChatBot"):
        gr.ChatInterface(chat_with_replit,
                         examples=[
                             "Explain about the attention is all you need",
                             "Who is the inventor of the GAN",
                             "What is the main idea style transfer?"
                         ])
    with gr.Tab(label="Chat with pdf"):
        gr.ChatInterface(fn=chat_with_replit_pdf,
                         additional_inputs=[
                             gr.Textbox(label="doi", placeholder="Enter doi number")
                         ],
                         type="messages")

# Start the server only when the file is run as a script, so importing the
# module does not block on a running web server.
if __name__ == "__main__":
    app.launch()