robertselvam committed
Commit
f0798cc
1 Parent(s): d655551

Create app.py

Files changed (1)
  1. app.py +356 -0
app.py ADDED
@@ -0,0 +1,356 @@
import openai
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader, WebBaseLoader
from langchain.prompts import PromptTemplate
import logging
from typing import List
import validators
import requests
from GoogleNews import GoogleNews
import pandas as pd
import gradio as gr
import re
from transformers import pipeline
import plotly.express as px
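
# Assumption: the original file never sets the OpenAI key explicitly; both the
# openai client and LangChain's ChatOpenAI read it from the OPENAI_API_KEY
# environment variable, so it is surfaced here for clarity.
openai.api_key = os.getenv("OPENAI_API_KEY")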


class KeyValueExtractor:

    def __init__(self):
        """
        Initialize the KeyValueExtractor object.

        Sets the zero-shot classification model used for sentiment scoring.
        """
        self.model = "facebook/bart-large-mnli"

    def get_news(self, keyword):
        """
        Search Google News for the keyword and collect the result links.

        Returns:
            List[str]: Links to the matching news articles.
        """
        googlenews = GoogleNews(lang='en', region='US', period='1d', encode='utf-8')
        googlenews.clear()
        googlenews.search(keyword)
        googlenews.get_page(2)
        news_result = googlenews.result(sort=True)
        news_data_df = pd.DataFrame.from_dict(news_result)

        # Collect the link from every row of the results dataframe.
        tot_news_link = []
        for index, headers in news_data_df.iterrows():
            news_link = str(headers['link'])
            tot_news_link.append(news_link)

        return tot_news_link

    def url_format(self, urls):
        """
        Extract a clean 'http(s)://' URL from each raw result string.
        """
        tot_url_links = []
        for url_text in urls:
            # Match URLs starting with 'http' or 'https'
            pattern = r'(https?://[^\s]+)'

            # Search for the URL in the text using the regex pattern
            match = re.search(pattern, url_text)

            if match:
                extracted_url = match.group(1)
                tot_url_links.append(extracted_url)
            else:
                print("No URL found in the given text.")

        return tot_url_links

    def clear_error_url(self, urls):
        """
        Drop URLs that are invalid or that do not return HTTP 200.
        """
        error_url = []
        for url in urls:
            if validators.url(url):
                headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
                r = requests.get(url, headers=headers)
                if r.status_code != 200:
                    print(f"Error fetching {url}: status code {r.status_code}")
                    error_url.append(url)
            else:
                # Malformed URLs are treated as errors as well.
                error_url.append(url)

        cleaned_list_url = [item for item in urls if item not in error_url]
        return cleaned_list_url

    def get_each_link_summary(self, urls):
        """
        Summarize the page behind each URL and concatenate the summaries.
        """
        each_link_summary = ""

        for url in urls:
            loader = WebBaseLoader(url)
            docs = loader.load()
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=3000, chunk_overlap=200
            )

            # Split the documents into chunks
            split_docs = text_splitter.split_documents(docs)

            # Prepare the prompt template for summarization
            prompt_template = """Write a concise summary of the following:
            {text}
            CONCISE SUMMARY:"""
            prompt = PromptTemplate.from_template(prompt_template)

            # Prepare the template for refining the summary with additional context
            refine_template = (
                "Your job is to produce a final summary.\n"
                "We have provided an existing summary up to a certain point: {existing_answer}\n"
                "We have the opportunity to refine the existing summary "
                "(only if needed) with some more context below.\n"
                "------------\n"
                "{text}\n"
                "------------\n"
                "Given the new context, refine the original summary. "
                "If the context isn't useful, return the original summary."
            )
            refine_prompt = PromptTemplate.from_template(refine_template)

            # Load the summarization chain using the ChatOpenAI language model
            chain = load_summarize_chain(
                llm=ChatOpenAI(temperature=0),
                chain_type="refine",
                question_prompt=prompt,
                refine_prompt=refine_prompt,
                return_intermediate_steps=True,
                input_key="input_documents",
                output_key="output_text",
            )

            # Generate the refined summary using the loaded summarization chain
            result = chain({"input_documents": split_docs}, return_only_outputs=True)
            print(result["output_text"])

            # Append this link's summary to the running text
            each_link_summary = each_link_summary + result["output_text"]

        return each_link_summary

    def save_text_to_file(self, each_link_summary) -> str:
        """
        Save the combined link summaries to a text file.

        Returns:
            str: Path to the saved text file.
        """
        file_path = "extracted_text.txt"
        try:
            with open(file_path, 'w') as file:
                # Write the combined summaries into the text file
                file.write(each_link_summary)
            # Return the file path of the saved text file
            return file_path
        except IOError as e:
            # If an IOError occurs during the file saving process, log the error
            logging.error(f"Error while saving text to file: {e}")

    def document_loader(self, file_path) -> List[str]:
        """
        Load the text from the saved file as documents.

        Returns:
            List[str]: List of loaded documents.
        """
        # Initialize the UnstructuredFileLoader
        loader = UnstructuredFileLoader(file_path, strategy="fast")
        # Load the documents from the file
        docs = loader.load()

        return docs

    def document_text_splitter(self, docs) -> List[str]:
        """
        Split documents into chunks for efficient processing.

        Returns:
            List[str]: List of split document chunks.
        """
        # Initialize the text splitter with specified chunk size and overlap
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=3000, chunk_overlap=200
        )

        # Split the documents into chunks
        split_docs = text_splitter.split_documents(docs)

        return split_docs

    def extract_key_value_pair(self, content) -> str:
        """
        Extract key-value pairs from the refined summary.

        Returns:
            str: The extracted key-value pairs.
        """
        try:
            # Use OpenAI's Completion API to analyze the text and extract key-value pairs
            response = openai.Completion.create(
                engine="text-davinci-003",  # You can choose a different engine as well
                temperature=0,
                prompt=f"Get the maximum count of meaningful key-value pairs. Content is in backticks. ```{content}```.",
                max_tokens=1000  # You can adjust the length of the response
            )

            # Extract and return the model's reply
            result = response['choices'][0]['text'].strip()
            return result
        except Exception as e:
            # If an error occurs during the key-value extraction process, log the error
            logging.error(f"Error while extracting key-value pairs: {e}")
            print("Error:", e)
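    # Note: OpenAI has since retired the "text-davinci-003" completions model.
    # A hedged sketch of the same call against the chat endpoint (assuming the
    # pre-1.0 openai client imported above; prompt_text stands for the same
    # f-string used in extract_key_value_pair):
    #
    #     response = openai.ChatCompletion.create(
    #         model="gpt-3.5-turbo",
    #         temperature=0,
    #         messages=[{"role": "user", "content": prompt_text}],
    #         max_tokens=1000,
    #     )
    #     result = response['choices'][0]['message']['content'].strip()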

    def refine_summary(self, split_docs):
        """
        Refine the summary using the provided context.

        Returns:
            Tuple[str, str]: The refined summary and its key-value pairs.
        """
        # Prepare the prompt template for summarization
        prompt_template = """Write a detailed, broad, abstractive summary of the following:
        {text}
        DETAILED SUMMARY:"""
        prompt = PromptTemplate.from_template(prompt_template)

        # Prepare the template for refining the summary with additional context
        refine_template = (
            "Your job is to produce a final summary.\n"
            "We have provided an existing summary up to a certain point: {existing_answer}\n"
            "We have the opportunity to refine the existing summary "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{text}\n"
            "------------\n"
            "Given the new context, refine the original summary. "
            "If the context isn't useful, return the original summary."
        )
        refine_prompt = PromptTemplate.from_template(refine_template)

        # Load the summarization chain using the ChatOpenAI language model
        chain = load_summarize_chain(
            llm=ChatOpenAI(temperature=0),
            chain_type="refine",
            question_prompt=prompt,
            refine_prompt=refine_prompt,
            return_intermediate_steps=True,
            input_key="input_documents",
            output_key="output_text",
        )

        # Generate the refined summary using the loaded summarization chain
        result = chain({"input_documents": split_docs}, return_only_outputs=True)

        # Extract key-value pairs from the refined summary
        key_value_pair = self.extract_key_value_pair(result["output_text"])

        # Return the refined summary along with its key-value pairs
        return result["output_text"], key_value_pair

    def analyze_sentiment_for_graph(self, text):
        pipe = pipeline("zero-shot-classification", model=self.model)
        labels = ["Positive", "Negative", "Neutral"]
        result = pipe(text, labels)
        sentiment_scores = {
            result['labels'][0]: result['scores'][0],
            result['labels'][1]: result['scores'][1],
            result['labels'][2]: result['scores'][2]
        }
        return sentiment_scores
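
    # Illustrative (hypothetical) output shape:
    #   analyze_sentiment_for_graph("Markets rallied after the announcement")
    #   -> {"Positive": 0.78, "Neutral": 0.15, "Negative": 0.07}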

    def display_graph(self, text):
        """
        Render the sentiment scores as a horizontal bar chart.
        """
        sentiment_scores = self.analyze_sentiment_for_graph(text)
        labels = sentiment_scores.keys()
        scores = sentiment_scores.values()
        fig = px.bar(x=scores, y=labels, orientation='h', color=labels,
                     color_discrete_map={"Negative": "red", "Positive": "green", "Neutral": "gray"})
        fig.update_traces(texttemplate='%{x:.2f}', textposition='outside')
        fig.update_layout(title="Sentiment Analysis", width=800)

        return fig

    def main(self, keyword):
        """
        Run the full pipeline: fetch news, clean the links, summarize each
        page, and produce a refined summary with key-value pairs.
        """
        urls = self.get_news(keyword)
        tot_urls = self.url_format(urls)
        clean_url = self.clear_error_url(tot_urls)
        each_link_summary = self.get_each_link_summary(clean_url)
        file_path = self.save_text_to_file(each_link_summary)
        docs = self.document_loader(file_path)
        split_docs = self.document_text_splitter(docs)
        result = self.refine_summary(split_docs)

        return result

    def gradio_interface(self):
        """
        Build and launch the Gradio UI.
        """
        with gr.Blocks(css="style.css", theme='karthikeyan-adople/hudsonhayes-gray') as app:
            gr.HTML("""<center class="darkblue" style='background-color:rgb(0,1,36); text-align:center;padding:25px;'><center><h1 class="center">
                    <img src="file=logo.png" height="110px" width="280px"></h1></center>
                    <br><h1 style="color:#fff">Summarizer</h1></center>""")
            with gr.Row(elem_id="col-container"):
                with gr.Column(scale=1.0, min_width=150):
                    input_news = gr.Textbox(label="NEWS")
            with gr.Row(elem_id="col-container"):
                with gr.Column(scale=1.0, min_width=150):
                    analyse = gr.Button("Analyse")
            with gr.Row(elem_id="col-container"):
                with gr.Column(scale=0.50, min_width=150):
                    result_summary = gr.Textbox(label="Summary")
                with gr.Column(scale=0.50, min_width=150):
                    key_value_pair_result = gr.Textbox(label="Key Value Pair")
            with gr.Row(elem_id="col-container"):
                with gr.Column(scale=0.70, min_width=0):
                    plot = gr.Plot(label="Sentiment")
            with gr.Row(elem_id="col-container"):
                with gr.Column(scale=1.0, min_width=150):
                    analyse_sentiment = gr.Button("Analyse Sentiment")

            analyse.click(self.main, input_news, [result_summary, key_value_pair_result])
            analyse_sentiment.click(self.display_graph, result_summary, [plot])

        app.launch(debug=True)


if __name__ == "__main__":
    text_process = KeyValueExtractor()
    text_process.gradio_interface()
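
A minimal smoke test of the pipeline without launching the UI, as a sketch: it assumes the file above is saved as app.py, the dependencies are installed, and OPENAI_API_KEY is exported (the keyword is illustrative):

    from app import KeyValueExtractor

    extractor = KeyValueExtractor()
    summary, key_value_pairs = extractor.main("artificial intelligence")
    print(summary)
    print(key_value_pairs)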