Spaces:

detectROSE
/

TermsAndConditionsTextSummarizer

Runtime error

App Files Files Community

TermsAndConditionsTextSummarizer / app.py

detectROSE

first_push

55750e2 verified 7 months ago

raw

history blame contribute delete

5.43 kB

	# -*- coding: utf-8 -*-
	"""FinalProject_TextClassificationFineTuning.ipynb

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/1fCS36Rnww__14QDdcsjjG5hfFL83gzpU
	"""

	# NOTE: The Colab export left IPython shell magics (`!pip install ...`) here.
	# They are not valid Python, so a plain script fails with a SyntaxError before
	# any code runs. They are kept as comments only; install these packages ahead
	# of time (for example by listing them in a requirements.txt) rather than at
	# import time. The extras name was also misspelled ("sentencepeice").
	#   pip install opendatasets
	#   pip install gradio --quiet
	#   pip install "transformers[sentencepiece]" datasets sacrebleu rouge_score py7zr -q
	#   pip install rake-nltk  # used to determine the key phrases in the text
	#   pip install kaggle

	# After done, delete all the models that are not needed
	import opendatasets as od
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.model_selection import train_test_split
	import opendatasets as od
	import gradio as gr
	from transformers import pipeline
	import matplotlib.pyplot as plt
	from datasets import load_dataset, load_metric
	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

	import nltk
	from nltk.tokenize import sent_tokenize

	nltk.download("punkt")

	from datasets import load_dataset
	from transformers import pipeline

	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
	import torch

	# Pick the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	# Bare expression left from the notebook: it only echoes the value in a cell.
	device

	# Pretrained model used here: PEGASUS (legal-pegasus), retrieved from https://huggingface.co/nsi319/legal-pegasus

	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

	# Load the "nsi319/legal-pegasus" checkpoint: tokenizer first, then the
	# sequence-to-sequence model.
	tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
	model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")

	# Placeholder document (a single space); put the text to summarize here.
	text = """ """

	# Encode the document as PyTorch tensors, truncated to at most 1024 tokens.
	input_tokenized = tokenizer.encode(
	    text,
	    return_tensors='pt',
	    max_length=1024,
	    truncation=True,
	)

	# Generate with 9 beams, no repeated 3-grams, and a 150-250 token length window.
	summary_ids = model.generate(
	    input_tokenized,
	    num_beams=9,
	    no_repeat_ngram_size=3,
	    length_penalty=2.0,
	    min_length=150,
	    max_length=250,
	    early_stopping=True,
	)

	# Keep only the first generated sequence as the summary text.
	summary = tokenizer.decode(
	    summary_ids[0],
	    skip_special_tokens=True,
	    clean_up_tokenization_spaces=False,
	)
	### Summary Output

	# The Securities and Exchange Commission today charged AT&T, Inc. and three of its Investor Relations executives with aiding and abetting the company's violations of the antifraud provisions of Section 10(b) of the Securities Exchange Act of 1934 and Rule 10b-5 thereunder. According to the SEC's complaint, the company learned in March 2016 that a steeper-than-expected decline in its first quarter smartphone sales would cause its revenue to fall short of analysts' estimates for the quarter. The complaint alleges that to avoid falling short of the consensus revenue estimate for the third consecutive quarter, the executives made private, one-on-one phone calls to analysts at approximately 20 separate firms. On these calls, the SEC alleges that Christopher Womack, Michael Black, and Kent Evans allegedly disclosed internal smartphone sales data and the impact of that data on internal revenue metrics. The SEC further alleges that as a result of what they were told, the analysts substantially reduced their revenue forecasts, leading to the overall consensus Revenue Estimate falling to just below the level that AT&t ultimately reported to the public on April 26, 2016. The SEC is seeking permanent injunctive relief and civil monetary penalties against each defendant.

	summary

	# Here we load the ToS dataset for additional fine-tuning. This step is optional; doing so only improves our model.
	# The only issue is that it requires a GPU: the runtime disconnects and crashes because I don't have access to the GPU or compute power it needs.

	# Loading the dataset
	# Kaggle URL of the ToS-summaries dataset. It gets its own name so that
	# `dataset` only ever refers to the loaded DataFrame.
	dataset_url = 'https://www.kaggle.com/datasets/simple11/tos-summaries'
	# Download the dataset with opendatasets.
	# NOTE(review): opendatasets is expected to place the files in a
	# "tos-summaries" folder under the current working directory and may ask for
	# Kaggle credentials - confirm against the opendatasets documentation.
	od.download(dataset_url)

	# Read the JSON-lines file relative to the working directory instead of the
	# Colab-only absolute path "/content/...", so the read location follows the
	# download location outside Colab as well.
	dataset = pd.read_json('tos-summaries/dataset.json', lines = True)
	# Bare expression left from the notebook: it only displays the frame in a cell.
	dataset

	#print(dataset.head(6))

	print(f"Summary: \n{summary}")

	# No-op string expression used as a section heading for the Gradio code.
	'summarization Gradio for my program'

	def summarize_text(text):
	    """Summarize ``text`` with the module-level tokenizer and model.

	    Args:
	        text: The document to summarize. It is truncated to at most 1024
	            tokens when encoded.

	    Returns:
	        The decoded first generated sequence with special tokens removed,
	        or an empty string when ``text`` is empty or only whitespace.
	    """
	    # Nothing to summarize: skip the expensive beam search entirely.
	    if not text or not text.strip():
	        return ""

	    # Encode the caller's text as PyTorch tensors.
	    inputs = tokenizer.encode(
	        text,
	        return_tensors='pt',
	        max_length=1024,
	        truncation=True,
	    )

	    # Generate from `inputs`. The previous code passed the module-level
	    # `input_tokenized` tensor here, so every call summarized the placeholder
	    # text instead of the text supplied by the caller.
	    summary_ids = model.generate(
	        inputs,
	        num_beams=9,
	        no_repeat_ngram_size=3,
	        length_penalty=2.0,
	        min_length=150,
	        max_length=250,
	        early_stopping=True,
	    )

	    # Decode and return the summary
	    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

	# Load the "nsi319/legal-pegasus" tokenizer and model; summarize_text reads
	# both as module-level names.
	tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
	model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")

	# Gradio UI: one multi-line input box wired to summarize_text, one output box.
	interface = gr.Interface(
	    title='Terms and Conditions Text Summarizer',
	    fn=summarize_text,
	    inputs=gr.Textbox(
	        label='Input text',
	        placeholder='Enter Text Here...',
	        lines=10,
	    ),
	    outputs=gr.Textbox(label='Summarized Text'),
	)
	# Start serving the interface.
	interface.launch()

	########################################################################################################

	import nltk
	from rake_nltk import Rake
	# Download the NLTK 'stopwords' and 'punkt' resources before building Rake.
	nltk.download('stopwords')
	nltk.download('punkt')

	# Uses stopwords for English from NLTK, and all punctuation characters by
	# default
	r = Rake()

	# Extract keywords from the module-level `summary` text.
	r.extract_keywords_from_text(summary)

	# Obtain keyword phrases ranked from highest to lowest.
	# NOTE(review): the result is neither assigned nor printed, so it is only
	# visible when run as a notebook cell - confirm whether it should be shown.
	r.get_ranked_phrases()

	# To get keyword phrases ranked highest to lowest with scores.
	# NOTE(review): same as above - the return value is discarded in a script.
	r.get_ranked_phrases_with_scores()