jaisidhsingh committed on
Commit
f98f59d
·
1 Parent(s): c548571
app.py.py ADDED
@@ -0,0 +1,162 @@
import streamlit as st
from summarize import *
from utils.sentence_embedding import *
from utils.clustering import *
from models.summarizers import *
from nltk.tokenize import sent_tokenize, word_tokenize
import math
from time import perf_counter


START = False
COMPLETED = False
PLACEHOLDER = "Enter your article"

st.markdown("Extractive Summarization for Large Articles 😊")

article = st.text_input(
    label="Welcome, enter your article, press enter, and then Summarize",
    value=PLACEHOLDER,
)

model_name = st.sidebar.selectbox(
    label="Pick your model of choice:",
    options=("BART", "Pegasus", "Distill-BART", "RoBERTa")
)

max_length = st.sidebar.slider(
    label="Choose the maximum length of the summary",
    min_value=100,
    max_value=500,
    value=250
)

min_length = st.sidebar.slider(
    label="Choose the minimum length of the summary",
    min_value=20,
    max_value=150,
    value=50
)

go = st.button(
    label="Summarize",
    key=0,
)

reset = st.button(
    label="Reset",
    key=1,
)


START = go
tmp_out = st.empty()

if reset:
    COMPLETED = not reset
    tmp_out.empty()
else:
    COMPLETED = reset


bar = st.progress(0)

if START and not COMPLETED:
    start_time = perf_counter()

    with tmp_out.container():
        st.write("Loading in models and preparing article...")

    summarization_model, summarization_tokenizer = load_summarizer(model_name)
    summarizer_token_limit = summarization_tokenizer.model_max_length

    # Pegasus's limit is compared against sentence count; the other models
    # are compared against word count.
    input_sent_toks = sent_tokenize(article)
    input_word_toks = word_tokenize(article)
    if "pegasus" in model_name.lower():
        num_toks = len(input_sent_toks)
    else:
        num_toks = len(input_word_toks)

    bar.progress(15)

    if num_toks <= summarizer_token_limit:
        with tmp_out.container():
            st.write(f"Input token count ({num_toks}) <= token limit ({summarizer_token_limit}), skipping optimization ...")

        pred_summary = summarize_input(article, summarization_model, summarization_tokenizer)
        bar.progress(100)

    else:
        with tmp_out.container():
            st.write(f"Input token count ({num_toks}) > token limit ({summarizer_token_limit}), optimizing ...")
            st.write(f"Going beyond the {model_name} token limit: {summarizer_token_limit}")

        # Embed every sentence, then cluster the embeddings so each cluster
        # fits inside the summarizer's context window.
        embeddings = make_embeddings(input_sent_toks, mean_pooling)
        embeddings = embeddings.numpy()

        bar.progress(30)

        n_clusters_estimate = math.ceil(num_toks / summarizer_token_limit)

        clemb = ClusterEmbeddings(
            cluster_estimate=n_clusters_estimate,
            cluster_fn="agglo",  # much better than k-means here
            embeddings=embeddings,
            sentences=np.array(input_sent_toks),
            words=np.array(input_word_toks)
        )

        bar.progress(50)
        curr = 50.0
        rem = 90 - curr

        sentence_clusters = clemb.get_sentence_clusters()

        # Summarize each cluster on its own, then summarize the concatenation
        # of the per-cluster summaries.
        n = len(sentence_clusters)
        summs = ""
        for cluster in sentence_clusters:
            cluster_summary = summarize_input(
                cluster,
                summarization_model,
                summarization_tokenizer,
                max_length=250,
                min_length=50,
            )
            if isinstance(cluster_summary, list):
                cluster_summary = cluster_summary[0]
            summs += cluster_summary + " "

            curr += rem / n
            bar.progress(int(curr))

        bar.progress(90)

        pred_summary = summarize_input(
            summs,
            summarization_model,
            summarization_tokenizer,
            max_length=max_length,
            min_length=min_length,
        )

        bar.progress(100)

    end_time = perf_counter()
    time_taken = end_time - start_time

    with tmp_out.container():
        st.write(f"Took {time_taken} seconds")
        st.write(f"Summary: {pred_summary}")

    START = False
    COMPLETED = True
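
Two notes on running this file: NLTK's `sent_tokenize`/`word_tokenize` need the Punkt data package at runtime, and the app is launched with Streamlit. A minimal sketch, assuming `punkt` has not already been downloaded:

    # one-time setup: fetch the Punkt sentence tokenizer used by nltk.tokenize
    import nltk
    nltk.download("punkt")

Then, from the repository root:

    streamlit run app.py.py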
models/__pycache__/summarizers.cpython-39.pyc ADDED
Binary file (1.58 kB).
 
models/summarizers.py ADDED
@@ -0,0 +1,56 @@
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def load_summarizer(model_code):
    name_dict = {
        "bart": "facebook/bart-large-cnn",
        "distill-bart": "sshleifer/distilbart-cnn-12-6",
        "roberta": "google/roberta2roberta_L-24_cnn_daily_mail",
        "pegasus": "google/pegasus-cnn_dailymail"
    }

    model_name = name_dict[model_code.lower()]
    model, tokenizer = None, None

    if "bart" in model_name:
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)

    elif "pegasus" in model_name:
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name)

    elif "roberta" in model_name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    return model, tokenizer


def summarize_input(
    input_article,
    model,
    tokenizer,
    max_length=150,
    min_length=50,
    num_beams=3,
    length_penalty=0.5,
    no_repeat_ngram_size=3
):
    # Truncate the input to the model's context window; without
    # truncation=True the max_length argument has no effect.
    text_input_ids = tokenizer.batch_encode_plus(
        [input_article],
        return_tensors='pt',
        truncation=True,
        max_length=tokenizer.model_max_length
    )['input_ids'].to("cpu")

    summary_ids = model.generate(
        text_input_ids,
        num_beams=int(num_beams),
        length_penalty=float(length_penalty),
        max_length=int(max_length),
        min_length=int(min_length),
        no_repeat_ngram_size=int(no_repeat_ngram_size)
    )
    summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
    # Pegasus emits "<n>" as a newline marker; strip it from the output.
    return summary_txt.replace("<n>", "")
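
For reference, a minimal usage sketch of the two helpers above; the model key and input text are illustrative:

    from models.summarizers import load_summarizer, summarize_input

    # any key from name_dict works: "bart", "distill-bart", "roberta", "pegasus"
    model, tokenizer = load_summarizer("bart")
    summary = summarize_input(
        "Streamlit lets you build interactive data apps in pure Python, "
        "and Hugging Face Spaces can host them for free.",
        model,
        tokenizer,
        max_length=60,
        min_length=10,
    )
    print(summary)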
requirements.txt ADDED
Binary file (9.19 kB).
 
summarize.py ADDED
@@ -0,0 +1,82 @@
from utils.sentence_embedding import *
from utils.clustering import *
from models.summarizers import *
from nltk.tokenize import sent_tokenize, word_tokenize
import math
from time import perf_counter


def get_summary(model_name, article, max_length, min_length, increment):
    start_time = perf_counter()
    summarization_model, summarization_tokenizer = load_summarizer(model_name)
    summarizer_token_limit = summarization_tokenizer.model_max_length
    print("Going beyond token limit:", summarizer_token_limit)

    input_word_toks = word_tokenize(article)
    num_words = len(input_word_toks)

    # Short inputs skip the clustering pipeline (currently only enabled for t5).
    if num_words <= summarizer_token_limit and model_name == "t5":
        pred_summary = summarize_input(article, summarization_model, summarization_tokenizer)
        end_time = perf_counter()
        time_taken = end_time - start_time
        print("Time taken: ", time_taken)

    else:
        input_sent_toks = sent_tokenize(article)
        embeddings = make_embeddings(input_sent_toks, mean_pooling)
        embeddings = embeddings.numpy()

        increment[0] = 20

        n_clusters_estimate = math.ceil(num_words / summarizer_token_limit)

        clemb = ClusterEmbeddings(
            cluster_estimate=n_clusters_estimate,
            cluster_fn="agglo",  # much better than k-means here
            embeddings=embeddings,
            sentences=np.array(input_sent_toks),
            words=np.array(input_word_toks)
        )

        increment[0] = 50

        sentence_clusters = clemb.get_sentence_clusters()

        n = len(sentence_clusters)
        summs = ""
        for cluster in sentence_clusters:
            cluster_summary = summarize_input(
                cluster,
                summarization_model,
                summarization_tokenizer,
                max_length=250,
                min_length=50,
            )
            if isinstance(cluster_summary, list):
                cluster_summary = cluster_summary[0]
            summs += cluster_summary + " "

            increment[0] += 40 / n

        pred_summary = summarize_input(
            summs,
            summarization_model,
            summarization_tokenizer,
            max_length=max_length,
            min_length=min_length,
        )

        increment[0] = 100

        end_time = perf_counter()
        time_taken = end_time - start_time

    return pred_summary, time_taken


def test():
    article = """Recent text-to-image matching models apply contrastive learning to large corpora of uncurated pairs of images and sentences. While such models can provide a powerful score for matching and subsequent zero-shot tasks, they are not capable of generating caption given an image. In this work, we repurpose such models to generate a descriptive text given an image at inference time, without any further training or tuning step. This is done by combining the visual-semantic model with a large language model, benefiting from the knowledge in both web-scale models. The resulting captions are much less restrictive than those obtained by supervised captioning methods. Moreover, as a zero-shot learning method, it is extremely flexible and we demonstrate its ability to perform image arithmetic in which the inputs can be either images or text and the output is a sentence."""
    model_name = "BART"
    summ, time_taken = get_summary(model_name, article, 250, 150, increment=[0])
    print(summ)
    print(time_taken)
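
`increment` is a single-element list used as a shared, mutable progress counter: `get_summary` writes to `increment[0]` in place, so a caller holding the same list can poll it while the summary is being computed. A sketch of that pattern (the worker thread and article text are illustrative, and the thread discards the return value, so this only shows the polling side):

    import threading
    import time

    article = "..."  # the long article to summarize
    progress = [0]   # shared list; get_summary mutates progress[0] in place

    worker = threading.Thread(
        target=get_summary,
        args=("BART", article, 250, 50, progress),
    )
    worker.start()
    while worker.is_alive():
        print(f"progress: {progress[0]:.0f}%")
        time.sleep(1)
    worker.join()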
utils/clustering.py ADDED
@@ -0,0 +1,64 @@
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt


class ClusterEmbeddings():
    def __init__(
        self,
        cluster_estimate,
        cluster_fn,
        embeddings,
        sentences,
        words
    ):
        self.cluster_estimate = cluster_estimate
        self.embeddings = embeddings
        self.sentences = sentences
        self.words = words

        self.cluster_fn = cluster_fn
        if self.cluster_fn == "agglo":
            self.clustering_algo = AgglomerativeClustering(n_clusters=self.cluster_estimate)
            self.num_clusters = cluster_estimate

        elif self.cluster_fn == "kmeans":
            self.clustering_algo = KMeans(n_clusters=self.cluster_estimate)
            self.num_clusters = cluster_estimate

        self.cluster = self.clustering_algo.fit(embeddings)
        self.labels = self.cluster.labels_

    def get_sentence_clusters(self):
        # Concatenate the sentences assigned to each cluster label into one
        # text chunk per cluster.
        sent_clusters = []
        chunk = ""

        for lbl in range(self.num_clusters):
            single_cluster = self.sentences[self.labels == lbl]
            for sent in single_cluster:
                chunk += sent + " "
            sent_clusters.append(chunk)
            chunk = ""

        return np.array(sent_clusters)

    def make_plot(self):
        # Project the sentence embeddings to 2D with t-SNE and scatter-plot
        # each cluster.
        projector = TSNE(
            n_components=2,
            learning_rate="auto",
            init="random"
        )
        proj_embeddings = np.array(
            projector.fit_transform(self.embeddings)
        )

        for lbl in range(self.num_clusters):
            xs = proj_embeddings[self.labels == lbl]
            plt.scatter(xs[:, 0], xs[:, 1], label=f"Cluster {lbl}")

        plt.legend()
        plt.xlabel("x1")
        plt.ylabel("x2")
        plt.show()
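
A small self-contained sketch of `ClusterEmbeddings` on random vectors; the shapes and sentences are illustrative:

    import numpy as np
    from utils.clustering import ClusterEmbeddings

    rng = np.random.default_rng(0)
    embeddings = rng.normal(size=(12, 384))  # 12 fake sentence embeddings
    sentences = np.array([f"Sentence {i}." for i in range(12)])

    clemb = ClusterEmbeddings(
        cluster_estimate=3,
        cluster_fn="agglo",
        embeddings=embeddings,
        sentences=sentences,
        words=np.array([]),  # stored but unused by get_sentence_clusters
    )
    chunks = clemb.get_sentence_clusters()
    print(len(chunks))  # 3 text chunks, one per cluster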
utils/sentence_embedding.py ADDED
@@ -0,0 +1,44 @@
import os
import sys

# Make the project root importable when this module is run from utils/.
cwd = os.getcwd()
module2add = os.path.dirname(cwd)
sys.path.append(module2add)

from configs.model_config import cfg as model_configs

from transformers import AutoTokenizer, AutoModel
import torch


def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, ignoring padding positions.
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


def make_embeddings(sentence_list, pool_fn):
    tokenizer = AutoTokenizer.from_pretrained(model_configs.sent_model_name)
    model = AutoModel.from_pretrained(model_configs.sent_model_name)

    encoded_input = tokenizer(
        sentence_list,
        padding=True,
        truncation=True,
        max_length=model_configs.sent_model_seq_limit,
        return_tensors='pt'
    )
    with torch.no_grad():
        embeddings = model(**encoded_input)

    attn_mask = encoded_input['attention_mask']
    sentence_embeddings = pool_fn(embeddings, attn_mask)
    return sentence_embeddings


def test_embedder():
    sentences = ['This framework generates embeddings for each input sentence',
                 'Sentences are passed as a list of string.',
                 'The quick brown fox jumps over the lazy dog.']

    embeddings = make_embeddings(sentences, mean_pooling)
    print(embeddings.shape)
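
`configs/model_config.py` is imported above but is not part of this commit. A hypothetical stand-in with the two attributes this module reads, assuming a SentenceTransformers MiniLM encoder (the actual model name and sequence limit are not shown in this commit):

    # configs/model_config.py -- hypothetical placeholder, not part of this commit
    from types import SimpleNamespace

    cfg = SimpleNamespace(
        sent_model_name="sentence-transformers/all-MiniLM-L6-v2",  # assumed encoder
        sent_model_seq_limit=128,  # assumed per-sentence token limit
    )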