|
import gradio |
|
import torch |
|
|
|
from transformers import pipeline |
|
from transformers import BertForSequenceClassification, BertTokenizer |
|
|
|
|
|
# Identifier of the FinBERT checkpoint. The tokenizer and the sequence
# classification model are both loaded from this same checkpoint.
_FINBERT_CHECKPOINT = 'ProsusAI/finbert'

tokenizer = BertTokenizer.from_pretrained(_FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(_FINBERT_CHECKPOINT)

# Summarization pipeline backed by the 't5-base' checkpoint.
summarizer = pipeline(task='summarization', model='t5-base')

# Emotion names used by the classifier-related code.
# NOTE(review): this list is not referenced in the visible code, and its order
# differs from the index mapping used by the inference code further down
# (0 -> positive, 1 -> negative, otherwise neutral) -- confirm before using it
# to index model outputs.
classifier_emotions = ['positive', 'neutral', 'negative']
|
|
|
|
|
|
|
def summarize_sentences(sentences_by_emotion, min_length, max_length):
    """Print one summary for every emotion that has at least one sentence.

    Args:
        sentences_by_emotion: Mapping from an emotion label (a string) to a
            sized collection of sentence strings.
        min_length: Forwarded to the summarizer as ``min_length``.
        max_length: Forwarded to the summarizer as ``max_length``.

    Returns:
        None. Each summary is written to stdout as ``"LABEL: summary"``
        followed by an empty line.
    """
    for emotion, sentences in sentences_by_emotion.items():
        # An emotion without sentences has nothing to summarize.
        if len(sentences) == 0:
            continue

        joined_text = ' '.join(sentences)
        result = summarizer(
            joined_text, min_length=min_length, max_length=max_length
        )
        summary_text = result[0]['summary_text']
        print(f"{emotion.upper()}: {summary_text}\n")
|
|
|
|
|
def chunk_text_to_window_size_and_predict_proba(
    input_ids, attention_mask, total_len, window_length=510, max_chunks=25
):
    """Classify a long token sequence window by window.

    The token ids are cut into consecutive, non-overlapping windows of
    ``window_length`` tokens. Each window is wrapped in the special token ids
    101 and 102, passed through the module-level ``model``, and the logits are
    turned into class probabilities with a softmax. Progress information
    (window bounds, decoded text, probabilities) is printed to stdout.

    Args:
        input_ids (List[int]): Token ids of the text, WITHOUT special tokens
            (they are added per window here).
        attention_mask (List[int]): Attention mask matching ``input_ids``.
        total_len (int): Number of tokens of ``input_ids`` to process.
        window_length (int): Number of content tokens per window. The default
            of 510 plus the two special tokens gives 512 tokens per model
            call (presumably the model's maximum sequence length).
        max_chunks (int): Upper bound on the number of windows processed;
            tokens beyond ``window_length * max_chunks`` are ignored.

    Returns:
        proba_list (List[torch.Tensor]): One ``(1, num_classes)`` probability
        tensor per processed window, in input order. The tensors carry no
        autograd history.

    Raises:
        ValueError: If ``window_length`` or ``max_chunks`` is smaller than 1.
    """
    if window_length < 1 or max_chunks < 1:
        raise ValueError('window_length and max_chunks must both be >= 1')

    # Special token ids wrapped around every window (assumed to be the
    # [CLS] / [SEP] ids of the FinBERT vocabulary).
    cls_token_id = 101
    sep_token_id = 102

    proba_list = []
    print(f'Total Length: {total_len}')

    # max(total_len, 1) keeps the historical behaviour of running exactly one
    # (special-tokens-only) window for an empty input, so callers always get
    # at least one probability tensor. Slicing the range enforces the cap.
    window_starts = range(0, max(total_len, 1), window_length)[:max_chunks]

    for count, start in enumerate(window_starts, start=1):
        end = start + window_length
        print(f'Start: {start}')
        print(f'End: {end}')

        input_ids_chunk = [cls_token_id] + input_ids[start:end] + [sep_token_id]
        attention_mask_chunk = [1] + attention_mask[start:end] + [1]

        # Build integer tensors directly instead of casting a float tensor.
        input_dict = {
            'input_ids': torch.tensor([input_ids_chunk], dtype=torch.long),
            'attention_mask': torch.tensor(
                [attention_mask_chunk], dtype=torch.int
            ),
        }

        # Inference only: without no_grad every window would keep its
        # autograd graph alive for as long as its probabilities are held.
        with torch.no_grad():
            outputs = model(**input_dict)
            probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)

        decoded = tokenizer.decode(input_ids_chunk)
        print(f'Loop Count: {count}')
        print("########:", decoded, ":##############")
        print("########:", probabilities, ":##############")

        proba_list.append(probabilities)

    return proba_list
|
|
|
def get_mean_from_proba(proba_list):
    """Average the per-chunk class probabilities.

    Args:
        proba_list (List[torch.Tensor]): One probability tensor per chunk,
            each of shape ``(1, num_classes)``.

    Returns:
        mean (torch.Tensor): 1-D tensor of length ``num_classes`` holding the
        mean probability of every class across all chunks. It carries no
        autograd history.

    Raises:
        RuntimeError: If ``proba_list`` is empty (raised by ``torch.stack``)
            or if the chunk tensors do not have a singleton first axis.
    """
    with torch.no_grad():
        # Shape: (num_chunks, 1, num_classes).
        stacked = torch.stack(proba_list)

        # Drop the singleton batch axis -> (num_chunks, num_classes).
        # ``reshape`` replaces the deprecated non-in-place ``Tensor.resize``
        # and, unlike it, fails loudly if the element count does not match
        # instead of silently truncating or padding the data.
        stacked = stacked.reshape(stacked.shape[0], stacked.shape[2])

        mean = stacked.mean(dim=0)

    return mean
|
|
|
|
|
def my_inference_function(text):
    """Return a sentiment label for ``text``.

    The text is tokenized, classified window by window, and the per-window
    class probabilities are averaged; the class with the highest mean
    probability decides the label.

    NOTE(review): the ``def`` line and the tokenization step were missing from
    this block (``return`` at module level, ``tokens`` undefined, and the
    Gradio interface referencing an undefined ``my_inference_function``).
    Both were reconstructed from how the block is used -- confirm against the
    original source.

    Args:
        text (str): Raw input text, as delivered by the Gradio "text" input.

    Returns:
        str: ``"Positive Sentiment"``, ``"Negative Sentiment"`` or
        ``"Neutral"``.
    """
    # Special tokens are left out here because
    # chunk_text_to_window_size_and_predict_proba wraps every window in the
    # special token ids itself.
    tokens = tokenizer(text, add_special_tokens=False)

    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    total_len = len(input_ids)

    proba_list = chunk_text_to_window_size_and_predict_proba(
        input_ids, attention_mask, total_len
    )
    mean = get_mean_from_proba(proba_list)
    sentiment = torch.argmax(mean).item()

    # Class index 0 is reported as positive, 1 as negative, anything else as
    # neutral.
    if sentiment == 0:
        return "Positive Sentiment"
    if sentiment == 1:
        return "Negative Sentiment"
    return "Neutral"
|
|
|
# Web UI around the inference function: one text input, one text output.
gr_interface = gradio.Interface(fn=my_inference_function, inputs="text", outputs="text")

# Start serving the interface; this runs as soon as the module is executed.
gr_interface.launch()
|
|