"""Gradio demo: FinBERT sentiment classification over arbitrarily long text.

The input is tokenized once, split into fixed-size windows, each window is
scored by the model, and the per-window class probabilities are averaged to
produce a single sentiment label.
"""

import gradio
import torch
from transformers import BertForSequenceClassification, BertTokenizer, pipeline

tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
summarizer = pipeline('summarization', model='t5-base')

# NOTE(review): this list is NOT in the order used to interpret the model's
# argmax in my_inference_function (0 -> positive, 1 -> negative, 2 -> neutral);
# do not index it with a predicted class id.
classifier_emotions = ['positive', 'neutral', 'negative']
# classifier_model_name = 'bhadresh-savani/distilbert-base-uncased-emotion'
# classifier_emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# 510 content tokens + [CLS] + [SEP] = 512 tokens per model call.
WINDOW_LENGTH = 510
# Upper bound on the number of windows scored per request.
MAX_CHUNKS = 25


def summarize_sentences(sentences_by_emotion, min_length, max_length):
    """Print one summary per emotion for every emotion that has sentences.

    Args:
        sentences_by_emotion (Dict[str, List[str]]): Sentences grouped by
            emotion label.
        min_length (int): Minimum summary length passed to the summarizer.
        max_length (int): Maximum summary length passed to the summarizer.
    """
    for emotion, sentences in sentences_by_emotion.items():
        if not sentences:
            continue
        text = ' '.join(sentences)
        summary = summarizer(text, min_length=min_length, max_length=max_length)
        print(f"{emotion.upper()}: {summary[0]['summary_text']}\n")


def chunk_text_to_window_size_and_predict_proba(input_ids, attention_mask, total_len):
    """Score the input window by window and collect class probabilities.

    The token ids are split into consecutive windows of ``WINDOW_LENGTH``
    tokens (at most ``MAX_CHUNKS`` windows). Each window is wrapped in
    [CLS]/[SEP], passed through the model, and the softmax of the logits is
    appended to the result list. An empty input still yields one
    ([CLS], [SEP]) window.

    Args:
        input_ids (List[int]): Token ids of the input text, WITHOUT special
            tokens (they are added here per window).
        attention_mask (List[int]): Attention mask aligned with ``input_ids``.
        total_len (int): Total length of ``input_ids``.

    Returns:
        proba_list (List[torch.Tensor]): One probability tensor of shape
            (1, num_classes) per window.
    """
    proba_list = []
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id

    print(f'Total Length: {total_len}')

    # max(total_len, 1) keeps a single window for empty input; slicing the
    # range enforces the cap on the number of windows.
    window_starts = range(0, max(total_len, 1), WINDOW_LENGTH)[:MAX_CHUNKS]

    for count, start in enumerate(window_starts, start=1):
        end = min(start + WINDOW_LENGTH, total_len)
        print(f'Start: {start}')
        print(f'End: {end}')

        # 1 => Define the text chunk and 2 => wrap it in [CLS] ... [SEP]
        input_ids_chunk = [cls_id] + input_ids[start:end] + [sep_id]
        attention_mask_chunk = [1] + attention_mask[start:end] + [1]

        # 3 => Convert the python lists to batched integer tensors directly
        # (no float round trip).
        input_dict = {
            'input_ids': torch.tensor([input_ids_chunk], dtype=torch.long),
            'attention_mask': torch.tensor([attention_mask_chunk], dtype=torch.long),
        }

        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = model(**input_dict)

        decoded = tokenizer.decode(input_ids_chunk)
        print(f'Loop Count: {count}')
        print("########:", decoded, ":##############")

        probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
        print("########:", probabilities, ":##############")
        proba_list.append(probabilities)

    return proba_list


def get_mean_from_proba(proba_list):
    """Average the per-window class probabilities.

    Args:
        proba_list (List[torch.Tensor]): Probability tensors of shape
            (1, num_classes), one per window.

    Returns:
        mean (torch.Tensor): Tensor of shape (num_classes,) holding the mean
            probability of each class across all windows.
    """
    with torch.no_grad():
        # Concatenating along the batch axis gives (num_windows, num_classes)
        # without the deprecated Tensor.resize call.
        stacked = torch.cat(proba_list, dim=0)
        mean = stacked.mean(dim=0)
    return mean


def my_inference_function(text):
    """Classify the overall sentiment of ``text``.

    Args:
        text (str): Raw input text from the Gradio text box.

    Returns:
        str: "Positive Sentiment", "Negative Sentiment" or "Neutral".
    """
    # NOTE(review): the `def` line and the tokenization step were missing from
    # the reviewed source. They are reconstructed here from the Interface
    # wiring below (text in, text out) and from the chunking function, which
    # adds [CLS]/[SEP] itself and therefore needs ids without special tokens.
    # Confirm against the original file.
    tokens = tokenizer(text, add_special_tokens=False)

    input_ids = tokens['input_ids']
    total_len = len(input_ids)
    attention_mask = tokens['attention_mask']

    proba_list = chunk_text_to_window_size_and_predict_proba(
        input_ids, attention_mask, total_len
    )
    mean = get_mean_from_proba(proba_list)

    sentiment = torch.argmax(mean).item()
    labels = {0: "Positive Sentiment", 1: "Negative Sentiment"}
    return labels.get(sentiment, "Neutral")


gr_interface = gradio.Interface(
    fn=my_inference_function,
    inputs="text",
    outputs="text",
)
gr_interface.launch()