Alanturner2 committed on
Commit
7c1cc3d
·
verified ·
1 Parent(s): d852a14

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +247 -0
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain.document_loaders import ArxivLoader
3
+ from PyPDF2 import PdfReader
4
+ from langchain_community.llms import HuggingFaceHub
5
+ from langchain.text_splitter import TokenTextSplitter
6
+ from langchain.chains.summarize import load_summarize_chain
7
+ from langchain.document_loaders import PyPDFLoader
8
+ from transformers import pipeline
9
+
10
+ from dotenv import load_dotenv
11
+ import os
12
+
13
# Load variables from a .env file (if one is present) into the process
# environment, then read the Hugging Face token used further down.
load_dotenv()
hugging_api_key = os.environ.get('HUGGING_API_KEY')
15
+
16
+ from groq import AsyncGroq
17
+ from groq import Groq
18
+
19
+ from langchain_groq import ChatGroq
20
+ from langchain.document_loaders import ArxivLoader
21
+ from langchain.vectorstores import Chroma
22
+ from langchain.chains import RetrievalQA
23
+ from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
24
+ from huggingface_hub import login
25
# Authenticate against the Hugging Face Hub and build the embedding model
# with the same token.
login(hugging_api_key)
embedding_model = HuggingFaceHubEmbeddings(huggingfacehub_api_token=hugging_api_key)

# The Groq key is read from the GROQ_API_KEY environment variable (which
# load_dotenv() can populate from .env) so that no secret lives in source.
llm = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192",
    api_key=os.getenv("GROQ_API_KEY"),
)
28
+
29
def display_results(result):
    """Join the entries of ``result`` into one newline-separated string.

    Args:
        result: Iterable of text fragments. Entries that are not already
            strings are converted with ``str()`` rather than letting
            ``str.join`` raise ``TypeError``.

    Returns:
        The fragments separated by a single newline; an empty iterable
        produces an empty string.
    """
    # The separator is ONE newline. A blank line between two fragments only
    # appears when the first fragment already ends with its own newline.
    return "\n".join(str(entry) for entry in result)
31
+
32
def summarize_pdf(pdf_file_path, max_length):
    """Summarize a local PDF by summarizing its text chunk by chunk.

    Args:
        pdf_file_path: Location of the PDF; passed straight to ``PdfReader``.
        max_length: Value of the "max length" slider. It is accepted so the
            Gradio click handler keeps its two-input signature, but it is
            not used by the summarization.

    Returns:
        The summaries returned by ``summarize_text`` for each chunk,
        concatenated in order. An empty string is returned when no text
        could be extracted from the PDF.
    """
    reader = PdfReader(pdf_file_path)
    # ``or ""`` guards against a page that yields no text (for example an
    # image-only page), which would otherwise break the concatenation.
    text = "".join(page.extract_text() or "" for page in reader.pages)
    if not text.strip():
        # Nothing to summarize; avoid sending an empty prompt to the model.
        return ""

    # NOTE(review): 8192-token chunks are larger than the 5700 used by
    # summarize_arxiv_pdf -- confirm they fit the model's request limits.
    text_splitter = TokenTextSplitter(chunk_size=8192, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return "".join(summarize_text(chunk) for chunk in chunks)
48
+
49
def summarize_text(text):
    """Ask the Groq chat model for a summary of ``text``.

    Args:
        text: The passage to summarize; it is interpolated into the user
            prompt as-is.

    Returns:
        The content of the first choice in the completion response.
    """
    # The key comes from the GROQ_API_KEY environment variable instead of
    # being embedded in source.
    sum_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    messages = [
        {
            "role": "system",
            "content": "You are summarizer. If I give you the whole text you should summarize it. And you don't need the title and author",
        },
        {
            "role": "user",
            "content": f"Summarize the paper. The whole text is {text}",
        },
    ]
    response = sum_client.chat.completions.create(
        messages=messages,
        model="llama3-70b-8192",
        temperature=0,
        max_tokens=8192,
        top_p=1,
        stop=None,
    )
    return response.choices[0].message.content
70
+
71
+
72
+
73
+
74
def remove_first_sentence_and_title(text):
    """Strip the leading sentence and any ``**Title:**`` line from ``text``.

    The first sentence is everything up to and including the first ``". "``.
    If ``text`` contains no such boundary it is left intact (previously the
    first character was dropped by accident in that case).

    The title is the span from ``**Title:**`` through the end of that line;
    when no newline follows the marker, everything from the marker onward is
    removed.

    Args:
        text: The text to clean up.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # partition() returns an empty separator when ". " is absent, which lets
    # the "no sentence boundary" case be told apart from a real match.
    _, boundary, remainder = text.partition('. ')
    if not boundary:
        remainder = text

    title_start = remainder.find('**Title:**')
    if title_start != -1:
        title_end = remainder.find('\n', title_start)
        tail = remainder[title_end + 1:] if title_end != -1 else ''
        remainder = remainder[:title_start] + tail

    return remainder.strip()
91
+
92
+
93
+
94
def summarize_arxiv_pdf(query):
    """Build a Markdown report for the arXiv papers matching ``query``.

    The loaded documents are split into token chunks, each chunk is
    summarized with ``summarize_text``, and the pieces are concatenated into
    one generated summary. That summary is then listed next to the title,
    authors, link and abstract of every entry returned by
    ``loader.get_summaries_as_docs()``.

    Args:
        query: Search string or arXiv identifier handed to ``ArxivLoader``.

    Returns:
        The Markdown text produced by ``display_results``, or a short
        "not found" message when the loader returns no documents.
    """
    loader = ArxivLoader(query=query, load_max_docs=10)
    documents = loader.load()
    if not documents:
        # Previously an empty result raised IndexError on documents[0].
        return f"No arXiv paper found for `{query}`."

    text_splitter = TokenTextSplitter(chunk_size=5700, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)

    ref_summary = "".join(summarize_text(chunk.page_content) for chunk in chunks)
    # Drop the boilerplate lead-in the model tends to prepend.
    ref_summary = ref_summary.replace("Here is a summary of the paper:", "").strip()

    summaries = []
    for doc in loader.get_summaries_as_docs():
        title = doc.metadata.get("Title")
        authors = doc.metadata.get("Authors")
        url = doc.metadata.get("Entry ID")
        summary = doc.page_content
        # NOTE(review): the same generated summary is attached to every
        # entry; confirm this is intended when several papers match.
        summaries.extend([
            f"**{title}**\n",
            f"**Authors:** {authors}\n",
            f"**View full paper:** [Link to paper]({url})\n",
            f"**Summary:** {summary}\n",
            f"**Lazyman Summary:**\n ",
            f"{ref_summary}",
        ])
    summaries = display_results(summaries)
    print(summaries)
    return summaries
127
+
128
+
129
# Shared asynchronous Groq client used by the chat handlers below. The key is
# read from the GROQ_API_KEY environment variable instead of being embedded
# in source.
client = AsyncGroq(api_key=os.getenv("GROQ_API_KEY"))
130
+
131
async def chat_with_replit(message, history):
    """Stream a chat completion for ``message`` given the prior ``history``.

    Args:
        message: The newest user message.
        history: Previous turns; each item is indexed as
            ``(user_text, assistant_text)``.

    Yields:
        The response text accumulated so far, once for every streamed delta
        that carries content.
    """
    # The system prompt is sent exactly once, ahead of the conversation.
    # It used to be appended inside the history loop, which repeated it per
    # past turn and omitted it entirely on the first turn.
    messages = [
        {
            "role": "system",
            "content": "You are assistor. I will ask you some questions than you should answer!",
        }
    ]
    for chat in history:
        messages.append({"role": 'user', "content": str(chat[0])})
        messages.append({"role": 'assistant', "content": str(chat[1])})
    messages.append({"role": "user", "content": str(message)})

    print(messages)

    response_content = ""
    stream = await client.chat.completions.create(
        messages=messages,
        model="llama3-70b-8192",
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=True,
    )
    async for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            response_content += content
            yield response_content
166
+
167
# HTML <script> tag for the Replit badge (dark theme, bottom-right position).
js = (
    '<script src="https://replit.com/public/js/replit-badge-v2.js" '
    'theme="dark" position="bottom-right"></script>'
)
168
+
169
+
170
# Per-process cache of indexed papers, keyed by the DOI / arXiv id string.
# Each value is ``(metadata, vector_store)``. The cache is unbounded; it
# grows by one entry per distinct paper requested.
_PAPER_INDEX_CACHE = {}


def _load_paper_index(doi_num):
    """Return ``(metadata, vector_store)`` for ``doi_num``, or ``None``.

    The paper is downloaded, split and embedded only the first time a given
    identifier is seen; later calls reuse the cached vector store. ``None``
    is returned when the loader finds no documents.
    """
    import hashlib

    key = str(doi_num)
    if key not in _PAPER_INDEX_CACHE:
        loader = ArxivLoader(query=key, load_max_docs=10)
        documents = loader.load_and_split()
        if not documents:
            return None
        # A dedicated collection per paper keeps chunks of different papers
        # from being mixed in one shared default collection. The hash yields
        # a name made only of characters Chroma accepts.
        collection_name = "arxiv-" + hashlib.sha1(key.encode("utf-8")).hexdigest()[:16]
        vector_store = Chroma.from_documents(
            documents,
            embedding_model,
            collection_name=collection_name,
        )
        _PAPER_INDEX_CACHE[key] = (documents[0].metadata, vector_store)
    return _PAPER_INDEX_CACHE[key]


async def chat_with_replit_pdf(message, history, doi_num):
    """Answer ``message`` about the arXiv paper identified by ``doi_num``.

    The three chunks most similar to ``message`` are retrieved from the
    paper's vector store and supplied to the model together with the paper
    metadata.

    Args:
        message: The user's question.
        history: Prior conversation supplied by the chat UI. It is accepted
            for interface compatibility but is not sent to the model.
        doi_num: Identifier entered in the "doi" textbox; used as the
            ``ArxivLoader`` query.

    Returns:
        The model's answer, or a short "not found" message when no paper
        matches ``doi_num``.
    """
    # NOTE(review): loading and embedding are synchronous calls and will
    # block the event loop the first time a paper is requested.
    indexed = _load_paper_index(doi_num)
    if indexed is None:
        return f"No arXiv paper found for `{doi_num}`."
    metadata, vector_store = indexed

    results = vector_store.similarity_search(message, k=3)
    relevant_content = "\n\n".join(doc.page_content for doc in results)

    # System instructions go first so they frame the user question.
    messages = [
        {
            "role": "system",
            "content": f"You should answer about this arxiv paper for {doi_num}.\n"
                       f"This is the metadata of the paper:{metadata}.\n"
                       f"This is relevant information of the paper:{relevant_content}.\n",
        },
        {
            "role": "user",
            "content": str(message),
        },
    ]

    print(messages)

    response = await client.chat.completions.create(
        messages=messages,
        model="llama3-70b-8192",
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )
    return response.choices[0].message.content
213
+
214
+
215
# Gradio UI: four tabs wired to the summarization and chat handlers.
# NOTE(review): the nesting below was reconstructed from a view of the file
# that had lost its indentation -- confirm the layout against the real app.
with gr.Blocks() as app:
    with gr.Tab(label="Arxiv summarization"):
        with gr.Column():
            number = gr.Textbox(label="Enter your arxiv number")
            sumarxiv_btn = gr.Button(value="summarize-arxiv")
        with gr.Column():
            outputs = gr.Markdown(label="Summary", height=1000)
        sumarxiv_btn.click(summarize_arxiv_pdf, inputs=number, outputs=outputs)
    with gr.Tab(label="Local summarization"):
        with gr.Row():
            with gr.Column():
                input_path = gr.File(label="Upload PDF file")
            with gr.Column():
                set_max_length = gr.Slider(512, 4096, value=2048, step=512, label="max length")
                sumlocal_btn = gr.Button(value="summarize-local")
        with gr.Row():
            output_local = gr.Markdown(label="summary", height=1000)
        sumlocal_btn.click(summarize_pdf, inputs=[input_path, set_max_length], outputs=output_local)
    with gr.Tab(label="ChatBot"):
        gr.ChatInterface(
            chat_with_replit,
            examples=[
                "Explain about the attention is all you need",
                "Who is the inventor of the GAN",
                "What is the main idea style transfer?",
            ],
        )
    with gr.Tab(label="Chat with pdf"):
        gr.ChatInterface(
            fn=chat_with_replit_pdf,
            additional_inputs=[
                gr.Textbox(label="doi", placeholder="Enter doi number"),
            ],
            type="messages",
        )

# Start the server only when the file is executed as a script, so importing
# the module (for tests or reuse) does not launch the app.
if __name__ == "__main__":
    app.launch()