Spaces:

Himanshusingh
/

sec_sentiment_analysis

Running

App Files Files Community

sec_sentiment_analysis / app.py

oceankim

Update app.py

9794d50 verified 5 months ago

raw

history blame

4.46 kB

	import gradio
	import torch

	from transformers import pipeline
	from transformers import BertForSequenceClassification, BertTokenizer


	# Tokenizer and sequence-classification weights, both loaded from the
	# 'ProsusAI/finbert' checkpoint. These module-level objects are used by the
	# chunked inference code further down the file.
	tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

	model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

	# Summarization pipeline backed by 't5-base'; called by summarize_sentences.
	summarizer = pipeline('summarization', model='t5-base')

	# Sentiment label names. Not referenced anywhere else in the visible code.
	# NOTE(review): the inference code at the bottom of the file treats class
	# index 1 as negative and index 2 as neutral, which does not match this
	# order -- confirm against model.config.id2label before indexing with it.
	classifier_emotions = ['positive', 'neutral', 'negative']
	# Alternative emotion classifier that was tried and disabled:
	# classifier_model_name = 'bhadresh-savani/distilbert-base-uncased-emotion'
	# classifier_emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

	def summarize_sentences(sentences_by_emotion, min_length, max_length):
	    """
	    Summarize the sentences collected for each emotion and print the result.

	    Emotions whose sentence list is empty are skipped. For every other emotion
	    the sentences are joined with single spaces, passed to the module-level
	    ``summarizer`` pipeline, and the summary is printed as ``EMOTION: text``.

	    Args:
	        sentences_by_emotion (Dict[str, List[str]]): Maps an emotion label to
	            the sentences assigned to that label.
	        min_length (int): Forwarded to the summarizer as ``min_length``.
	        max_length (int): Forwarded to the summarizer as ``max_length``.

	    Returns:
	        summaries (Dict[str, str]): Summary text for each emotion that had at
	            least one sentence. Callers that ignore the return value see the
	            same printed output as before.
	    """
	    summaries = {}

	    for emotion, sentences in sentences_by_emotion.items():
	        # Nothing to summarize for this label.
	        if not sentences:
	            continue

	        text = ' '.join(sentences)
	        summary = summarizer(text, min_length=min_length, max_length=max_length)
	        summaries[emotion] = summary[0]['summary_text']
	        print(f"{emotion.upper()}: {summaries[emotion]}\n")

	    return summaries


	def chunk_text_to_window_size_and_predict_proba(input_ids, attention_mask, total_len,
	                                                window_length=510, max_chunks=25):
	    """
	    This function splits the given input text into chunks of a specified window length,
	    applies transformer model to each chunk and computes probabilities of each class for each chunk.
	    The computed probabilities are then appended to a list.

	    At least one chunk is always evaluated (an empty input yields a single
	    chunk holding only [CLS] and [SEP]). Tokens beyond
	    ``window_length * max_chunks`` are ignored.

	    Args:
	        input_ids (List[int]): List of token ids representing the input text,
	            without special tokens (they are added per chunk here).
	        attention_mask (List[int]): List of attention masks corresponding to input_ids.
	        total_len (int): Total length of the input_ids.
	        window_length (int): Number of tokens per chunk before [CLS]/[SEP] are
	            added. The default of 510 presumably leaves room for the two
	            special tokens within a 512-token model limit -- TODO confirm.
	        max_chunks (int): Upper bound on the number of chunks evaluated.

	    Returns:
	        proba_list (List[torch.Tensor]): List of probability tensors for each chunk.

	    Raises:
	        ValueError: If ``window_length`` or ``max_chunks`` is smaller than 1.
	    """
	    if window_length < 1 or max_chunks < 1:
	        raise ValueError('window_length and max_chunks must both be >= 1')

	    proba_list = []
	    print(f'Total Length: {total_len}')

	    # max(total_len, 1) keeps a single window for empty input; slicing the
	    # range caps the number of windows at max_chunks.
	    window_starts = range(0, max(total_len, 1), window_length)[:max_chunks]

	    for count, start in enumerate(window_starts, start=1):
	        # ``end`` may run past the input; list slicing clamps it.
	        end = start + window_length

	        print(f'Start: {start}')
	        print(f'End: {end}')

	        # 1 => Define the text chunk and wrap it in [CLS] (101) ... [SEP] (102)
	        input_ids_chunk = [101] + input_ids[start:end] + [102]
	        attention_mask_chunk = [1] + attention_mask[start:end] + [1]

	        # 2 => Convert regular python lists to integer tensors with a batch
	        # dimension of 1 (built directly as integers, no float round-trip)
	        input_dict = {
	            'input_ids': torch.tensor([input_ids_chunk], dtype=torch.long),
	            'attention_mask': torch.tensor([attention_mask_chunk], dtype=torch.int),
	        }

	        # 3 => Inference only: skip building the autograd graph
	        with torch.no_grad():
	            outputs = model(**input_dict)

	        decoded = tokenizer.decode(input_ids_chunk)
	        print(f'Loop Count: {count}')
	        print("########:", decoded, ":##############")

	        probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
	        print("########:", probabilities, ":##############")
	        proba_list.append(probabilities)

	    return proba_list

	def get_mean_from_proba(proba_list):
	    """
	    This function computes the mean probabilities of class predictions over all the chunks.

	    Args:
	        proba_list (List[torch.Tensor]): List of probability tensors for each chunk.
	            Each tensor is expected to be 2-D with shape (1, num_classes).

	    Returns:
	        mean (torch.Tensor): Mean of the probabilities across all chunks,
	            a 1-D tensor of length num_classes.
	    """

	    # Ensures that gradients are not computed, saving memory
	    with torch.no_grad():
	        # Concatenating along dim 0 folds the per-chunk batch dimension away,
	        # giving (num_chunks, num_classes) without the deprecated
	        # non-in-place Tensor.resize.
	        stacked = torch.cat(proba_list, dim=0)

	        # Compute the mean along the zeroth dimension (i.e., the chunk dimension)
	        mean = stacked.mean(dim=0)

	    return mean


	def my_inference_function(text):
	    """
	    Classify the overall sentiment of ``text`` by averaging per-chunk predictions.

	    The text is tokenized, split into windows and scored by
	    chunk_text_to_window_size_and_predict_proba; the per-chunk probabilities
	    are averaged by get_mean_from_proba and the highest-scoring class wins.

	    Args:
	        text (str): Raw input text from the Gradio text box.

	    Returns:
	        str: "Positive Sentiment", "Negative Sentiment" or "Neutral".
	    """
	    # [CLS]/[SEP] are added per chunk downstream, so tokenize without special
	    # tokens and keep plain Python lists (the chunker concatenates lists).
	    tokens = tokenizer(text, add_special_tokens=False)

	    input_ids = tokens['input_ids']
	    attention_mask = tokens['attention_mask']
	    total_len = len(input_ids)

	    proba_list = chunk_text_to_window_size_and_predict_proba(input_ids, attention_mask, total_len)
	    mean = get_mean_from_proba(proba_list)
	    sentiment = torch.argmax(mean).item()

	    # NOTE(review): assumes class index 0 is positive and 1 is negative, with
	    # anything else neutral -- confirm against model.config.id2label.
	    if sentiment == 0:
	        return "Positive Sentiment"
	    if sentiment == 1:
	        return "Negative Sentiment"
	    return "Neutral"

	# Expose the inference function through a plain text-in / text-out Gradio UI.
	gr_interface = gradio.Interface(fn=my_inference_function, inputs="text", outputs="text")

	# Start serving the interface.
	gr_interface.launch()