File size: 4,462 Bytes
29c5704
b3301a4
37fc349
47f38d5
37fc349
 
b3301a4
37fc349
47f38d5
37fc349
062e6a2
b9a198e
47f38d5
b9a198e
47f38d5
 
 
b9a198e
 
 
 
 
 
37fc349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9a198e
37fc349
 
 
 
aa47040
6469f49
 
37fc349
 
 
ffe7a35
37fc349
9eb6d05
37fc349
6469f49
 
 
37fc349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9a198e
 
215e416
aa47040
b9a198e
37fc349
 
567a866
37fc349
dcabcbb
286dd08
 
37fc349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71e2445
b9a198e
37fc349
71e2445
37fc349
 
 
 
e483bf1
 
 
 
 
 
 
 
37fc349
29c5704
 
7af1a44
 
a11ee17
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gradio
import torch

from transformers import pipeline
from transformers import BertForSequenceClassification, BertTokenizer


# FinBERT: BERT fine-tuned for financial sentiment (positive/negative/neutral).
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

# Separate T5 pipeline used to summarize sentences grouped by emotion.
summarizer = pipeline('summarization', model='t5-base')

# Emotion labels expected as keys by summarize_sentences().
classifier_emotions = ['positive', 'neutral', 'negative']
# Alternative classifier left for reference (6-way emotion model):
# classifier_model_name = 'bhadresh-savani/distilbert-base-uncased-emotion'
# classifier_emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

def summarize_sentences(sentences_by_emotion, min_length, max_length):
    """
    Print a T5 summary for each emotion's collected sentences.

    Args:
        sentences_by_emotion (dict): Maps an emotion label to a list of
            sentence strings; empty lists are skipped.
        min_length (int): Minimum summary length passed to the summarizer.
        max_length (int): Maximum summary length passed to the summarizer.

    Returns:
        None. Output is printed, one "LABEL: summary" line per emotion.
    """
    for emotion, sentences in sentences_by_emotion.items():
        # Nothing to summarize for this emotion — skip it.
        if not sentences:
            continue
        joined = ' '.join(sentences)
        result = summarizer(joined, min_length=min_length, max_length=max_length)
        print(f"{emotion.upper()}: {result[0]['summary_text']}\n")


def chunk_text_to_window_size_and_predict_proba(input_ids, attention_mask, total_len):
    """
    Split the tokenized input into chunks, run the classifier on each chunk,
    and collect the per-chunk class probabilities.

    Each 510-token chunk is wrapped with BERT's special tokens
    ([CLS] = 101, [SEP] = 102), so a wrapped chunk never exceeds BERT's
    512-token limit.

    Args:
        input_ids (List[int]): Token ids representing the input text.
        attention_mask (List[int]): Attention mask aligned with input_ids.
        total_len (int): Length of input_ids.

    Returns:
        List[torch.Tensor]: Softmax probability tensors, one per chunk,
        each of shape (1, num_classes).
    """
    proba_list = []

    start = 0
    window_length = 510
    # Safety cap on chunk count (~25 * 510 tokens) to bound runtime.
    max_chunks = 25

    count = 1
    print(f'Total Length: {total_len}')

    while True:
        end = start + window_length
        # Final iteration once the window covers the input or the cap is hit;
        # the current chunk is still processed before exiting.
        is_last = (end >= total_len) or (count >= max_chunks)

        print(f'Start: {start}')
        print(f'End: {end}')

        # 1 => Define the text chunk (slicing past the end is safe in Python).
        input_ids_chunk = input_ids[start : end]
        attention_mask_chunk = attention_mask[start : end]

        # 2 => Append [CLS] and [SEP]
        input_ids_chunk = [101] + input_ids_chunk + [102]
        attention_mask_chunk = [1] + attention_mask_chunk + [1]

        # 3 => Convert to tensors shaped (1, chunk_len).
        input_dict = {
            'input_ids' : torch.Tensor([input_ids_chunk]).long(),
            'attention_mask' : torch.Tensor([attention_mask_chunk]).int()
        }

        # Inference only: no_grad avoids building the autograd graph, which
        # the original version omitted and which needlessly holds memory.
        with torch.no_grad():
            outputs = model(**input_dict)
            probabilities = torch.nn.functional.softmax(outputs[0], dim = -1)

        decoded = tokenizer.decode(input_ids_chunk)
        print(f'Loop Count: {count}')
        count = count + 1
        print("########:", decoded , ":##############")
        print("########:", probabilities , ":##############")
        proba_list.append(probabilities)

        start = end
        if is_last:
            break

    return proba_list

def get_mean_from_proba(proba_list):
    """
    Compute the element-wise mean of the per-chunk probability tensors.

    Args:
        proba_list (List[torch.Tensor]): Probability tensors of shape
            (1, num_classes), one per chunk.

    Returns:
        torch.Tensor: Mean probabilities, shape (num_classes,).
    """
    # No gradients needed — this is pure post-processing of inference output.
    with torch.no_grad():
        # Stack into a single (num_chunks, 1, num_classes) tensor.
        stacks = torch.stack(proba_list)

        # Drop the singleton batch dimension -> (num_chunks, num_classes).
        # The original used the deprecated Tensor.resize(); reshape is the
        # supported equivalent for this view change.
        stacks = stacks.reshape(stacks.shape[0], stacks.shape[2])

        # Average over the chunk dimension.
        mean = stacks.mean(dim = 0)

    return mean


def my_inference_function(text):
    """
    End-to-end sentiment inference: tokenize the text, classify it chunk by
    chunk, average the chunk probabilities, and return a readable label.

    NOTE(review): the original file had lost this function's `def` line and
    tokenizer call, leaving its body stranded as unreachable code after
    get_mean_from_proba's return; reconstructed here so the gradio
    interface's `fn=my_inference_function` resolves instead of raising
    NameError.

    Args:
        text (str): Raw input text to classify.

    Returns:
        str: "Positive Sentiment", "Negative Sentiment", or "Neutral".
    """
    # add_special_tokens=False because [CLS]/[SEP] are added per-chunk
    # inside chunk_text_to_window_size_and_predict_proba.
    tokens = tokenizer.encode_plus(text, add_special_tokens=False)

    input_ids = tokens['input_ids']
    total_len = len(input_ids)
    attention_mask = tokens['attention_mask']

    proba_list = chunk_text_to_window_size_and_predict_proba(input_ids, attention_mask, total_len)
    mean = get_mean_from_proba(proba_list)
    sentiment = torch.argmax(mean).item()

    # ProsusAI/finbert label order: 0=positive, 1=negative, 2=neutral.
    if sentiment == 0:
        return "Positive Sentiment"
    elif sentiment == 1:
        return "Negative Sentiment"
    else:
        return "Neutral"
    
# Wire the inference function into a simple text-in / text-out gradio UI.
# NOTE(review): my_inference_function has no visible `def` in this file —
# its apparent body sits stranded after get_mean_from_proba's return
# (L381-L394 region); as-is this line raises NameError. Verify/restore
# the function definition.
gr_interface = gradio.Interface(
    fn = my_inference_function,
    inputs = "text",
    outputs = "text"
)

# Starts the local gradio web server (blocking call).
gr_interface.launch()