File size: 4,462 Bytes
29c5704 b3301a4 37fc349 47f38d5 37fc349 b3301a4 37fc349 47f38d5 37fc349 062e6a2 b9a198e 47f38d5 b9a198e 47f38d5 b9a198e 37fc349 b9a198e 37fc349 aa47040 6469f49 37fc349 ffe7a35 37fc349 9eb6d05 37fc349 6469f49 37fc349 b9a198e 215e416 aa47040 b9a198e 37fc349 567a866 37fc349 dcabcbb 286dd08 37fc349 71e2445 b9a198e 37fc349 71e2445 37fc349 e483bf1 37fc349 29c5704 7af1a44 a11ee17 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import gradio
import torch
from transformers import pipeline
from transformers import BertForSequenceClassification, BertTokenizer
# FinBERT tokenizer/model pair for financial sentiment classification
# (downloaded from the Hugging Face hub on first run).
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
# t5-base summarization pipeline used by summarize_sentences().
summarizer = pipeline('summarization', model='t5-base')
# NOTE(review): this list is unused in the visible code, and its order
# (positive, neutral, negative) does not match the index->label mapping
# applied to the model output further down (0=positive, 1=negative,
# 2=neutral) — confirm before relying on it.
classifier_emotions = ['positive', 'neutral', 'negative']
# classifier_model_name = 'bhadresh-savani/distilbert-base-uncased-emotion'
# classifier_emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
def summarize_sentences(sentences_by_emotion, min_length, max_length):
    """Summarize the sentences collected under each emotion and print each summary.

    Args:
        sentences_by_emotion (dict[str, list[str]]): Maps an emotion label to
            the list of sentences classified under it.
        min_length (int): Minimum summary length passed to the summarizer.
        max_length (int): Maximum summary length passed to the summarizer.

    Returns:
        dict[str, str]: Emotion label -> generated summary text, with entries
        only for emotions that had at least one sentence.  (Previously the
        function returned None; callers that ignore the return value are
        unaffected.)
    """
    summaries = {}
    # Iterate items() directly instead of keys() plus a second lookup, and
    # skip empty lists via truthiness rather than len(...) != 0.
    for emotion, sentences in sentences_by_emotion.items():
        if not sentences:
            continue
        text = ' '.join(sentences)
        # `summarizer` is the module-level t5-base summarization pipeline.
        summary = summarizer(text, min_length=min_length, max_length=max_length)
        summary_text = summary[0]['summary_text']
        print(f"{emotion.upper()}: {summary_text}\n")
        summaries[emotion] = summary_text
    return summaries
def chunk_text_to_window_size_and_predict_proba(input_ids, attention_mask, total_len,
                                                window_length=510, max_chunks=25):
    """Split tokenized text into windows and predict class probabilities per window.

    Each window of at most `window_length` tokens is wrapped with BERT's
    [CLS] (101) / [SEP] (102) special token ids, run through the module-level
    `model`, and softmaxed into a probability distribution.

    Args:
        input_ids (List[int]): Token ids of the full input text (without
            special tokens; they are added per chunk here).
        attention_mask (List[int]): Attention mask aligned with `input_ids`.
        total_len (int): Total length of `input_ids`.
        window_length (int): Tokens per chunk before adding [CLS]/[SEP];
            the default 510 (+2 specials) fits BERT's 512-token limit.
            (New parameter, defaults to the previously hard-coded value.)
        max_chunks (int): Safety cap on the number of chunks processed.
            (New parameter, defaults to the previously hard-coded 25.)

    Returns:
        proba_list (List[torch.Tensor]): One (1, num_classes) probability
        tensor per chunk.
    """
    proba_list = []
    start = 0
    loop = True
    count = 1
    print(f'Total Length: {total_len}')
    while loop:
        end = start + window_length
        # Stop after this iteration once we reach the end of the input or the
        # chunk cap; the final (possibly short) chunk is still processed.
        if (end >= total_len) or (count >= max_chunks):
            loop = False
        print(f'Start: {start}')
        print(f'End: {end}')
        # 1 => slice out this window (slicing past the end is safe in Python).
        input_ids_chunk = input_ids[start:end]
        attention_mask_chunk = attention_mask[start:end]
        # 2 => wrap with [CLS] ... [SEP] so each chunk looks like a full sequence.
        input_ids_chunk = [101] + input_ids_chunk + [102]
        attention_mask_chunk = [1] + attention_mask_chunk + [1]
        # 3 => convert to tensors with a batch dimension of 1.
        input_dict = {
            'input_ids': torch.Tensor([input_ids_chunk]).long(),
            'attention_mask': torch.Tensor([attention_mask_chunk]).int()
        }
        # Inference only: no_grad avoids building the autograd graph, saving
        # memory for long inputs (the original ran the model with grad on).
        with torch.no_grad():
            outputs = model(**input_dict)
            probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
        decoded = tokenizer.decode(input_ids_chunk)
        print(f'Loop Count: {count}')
        count = count + 1
        print("########:", decoded, ":##############")
        print("########:", probabilities, ":##############")
        proba_list.append(probabilities)
        start = end
    return proba_list
def get_mean_from_proba(proba_list):
    """Average the per-chunk class probabilities into one distribution.

    Args:
        proba_list (List[torch.Tensor]): One (1, num_classes) probability
            tensor per chunk, as produced by
            chunk_text_to_window_size_and_predict_proba.

    Returns:
        mean (torch.Tensor): 1-D tensor of shape (num_classes,) — the mean
        probability per class across all chunks.

    Raises:
        RuntimeError: If `proba_list` is empty (torch.cat of zero tensors).
    """
    # Ensures that gradients are not computed, saving memory.
    with torch.no_grad():
        # Concatenating the (1, C) chunk tensors along dim 0 yields the
        # (N, C) matrix directly.  This replaces the original
        # stack-then-Tensor.resize() dance: Tensor.resize() is deprecated
        # and can silently reinterpret storage, whereas cat is the
        # idiomatic, shape-safe equivalent here.
        stacked = torch.cat(proba_list, dim=0)
        # Mean along the zeroth (chunk) dimension.
        mean = stacked.mean(dim=0)
    return mean
# NOTE(review): the lines below read a `tokens` variable and use bare
# `return` statements, so they must be the body of a function — presumably
# something like `def my_inference_function(text)` (the name the Gradio
# interface below binds to) whose `def` line and tokenization step are
# missing from this view.  As written, this is not valid at module level;
# restore the enclosing function before running.
input_ids = tokens['input_ids']
total_len = len(input_ids)
attention_mask = tokens['attention_mask']
proba_list = chunk_text_to_window_size_and_predict_proba(input_ids, attention_mask, total_len )
mean = get_mean_from_proba(proba_list)
# argmax over the mean distribution picks the most probable class index.
sentiment = torch.argmax(mean).item()
# Index-to-label mapping here follows ProsusAI/finbert's published label
# order (0=positive, 1=negative, 2=neutral) — note this disagrees with the
# unused `classifier_emotions` list defined near the top of the file.
if sentiment == 0:
    return "Positive Sentiment"
elif sentiment == 1:
    return "Negative Sentiment"
else:
    return "Neutral"
# Wire the inference function into a minimal Gradio text-in / text-out UI.
# NOTE(review): `my_inference_function` is not defined anywhere in this
# view — presumably it is the function whose body appears above; confirm
# it exists before launch, otherwise this raises NameError.
gr_interface = gradio.Interface(
fn = my_inference_function,
inputs = "text",
outputs = "text"
)
# Starts the local Gradio server (blocking call).
gr_interface.launch()
|