import gradio as gr |
import torch |
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel |
import re |
from textblob import TextBlob |
from nltk import pos_tag, word_tokenize |
from nltk.corpus import stopwords |
import emoji |
import string |
import nltk |
from nltk import pos_tag |
from nltk.tokenize import word_tokenize |
from nltk.corpus import stopwords |
import textstat |
import pandas as pd |
from transformers import pipeline |
from torch.utils.data import Dataset, DataLoader |
import torch.nn as nn |
import os |
from dotenv import load_dotenv |
load_dotenv() |
def average_word_length(tweet):
    """Return the mean length of the whitespace-separated words in ``tweet``.

    Returns 0.0 when the tweet contains no words (empty or whitespace-only
    input) instead of raising ``ZeroDivisionError``.
    """
    words = tweet.split()
    if not words:
        # No words to average over; avoid dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)
def lexical_diversity(tweet):
    """Return the ratio of distinct words to total words in ``tweet``.

    Words are whitespace-separated and compared case-sensitively.
    Returns 0.0 when the tweet contains no words (empty or whitespace-only
    input) instead of raising ``ZeroDivisionError``.
    """
    words = tweet.split()
    if not words:
        # No words at all; avoid dividing by zero.
        return 0.0
    return len(set(words)) / len(words)
def count_capital_letters(tweet):
    """Return how many characters of ``tweet`` are uppercase."""
    # Summing booleans counts the True values.
    return sum(ch.isupper() for ch in tweet)
def count_words_surrounded_by_colons(tweet):
    """Return the number of non-overlapping ``:word:`` tokens in ``tweet``."""
    colon_word = r':(\w+):'
    # Count matches lazily rather than materialising the captured names.
    return sum(1 for _ in re.finditer(colon_word, tweet))
def count_emojis(tweet):
    """Return the number of ``:name:`` tokens in the demojized ``tweet``.

    The tweet is passed through ``emoji.demojize`` and the result is handed
    to ``count_words_surrounded_by_colons``, so any literal ``:word:`` text
    already present in the tweet is counted as well.
    """
    return count_words_surrounded_by_colons(emoji.demojize(tweet))
def hashtag_frequency(tweet):
    """Return the number of ``#word`` occurrences in ``tweet``."""
    return sum(1 for _ in re.finditer(r'#\w+', tweet))
def mention_frequency(tweet):
    """Return the number of ``@word`` occurrences in ``tweet``."""
    return sum(1 for _ in re.finditer(r'@\w+', tweet))
def count_special_characters(tweet):
    """Return how many characters of ``tweet`` are in ``string.punctuation``."""
    # Summing booleans counts the punctuation characters.
    return sum(ch in string.punctuation for ch in tweet)
def stop_word_frequency(tweet):
    """Return the number of whitespace-separated words that are stop words.

    Each word is lower-cased and checked against NLTK's English stop-word
    list.
    """
    english_stops = set(stopwords.words('english'))
    return sum(1 for token in tweet.split() if token.lower() in english_stops)
# Fetch the NLTK resources used by the feature extractors in this file
# (tokenizer, POS tagger, stop-word list), in the same order as before.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(_nltk_resource)
def get_linguistic_features(tweet):
    """Return part-of-speech counts for ``tweet`` as a dict.

    The tweet is tokenized with ``word_tokenize``; tokens that are not
    alphanumeric or that are English stop words are dropped, the rest are
    lower-cased and tagged with ``pos_tag``. The returned dict maps eight
    ``*_Count`` keys to integer counts, in a fixed key order.
    """
    tokens = word_tokenize(tweet)
    english_stops = set(stopwords.words('english'))
    content_words = [
        token.lower()
        for token in tokens
        if token.isalnum() and token.lower() not in english_stops
    ]
    tagged = pos_tag(content_words)

    def count_where(predicate):
        # Number of (word, tag) pairs satisfying ``predicate``.
        return sum(1 for word, tag in tagged if predicate(word, tag))

    return {
        'Noun_Count': count_where(lambda word, tag: tag.startswith('N')),
        'Verb_Count': count_where(lambda word, tag: tag.startswith('V')),
        # Verb-tagged words containing "ing" or "ed" anywhere in the word.
        'Participle_Count': count_where(
            lambda word, tag: tag.startswith('V') and ('ing' in word or 'ed' in word)
        ),
        'Interjection_Count': count_where(lambda word, tag: tag == 'UH'),
        'Pronoun_Count': count_where(lambda word, tag: tag.startswith('PRP')),
        'Preposition_Count': count_where(lambda word, tag: tag.startswith('IN')),
        'Adverb_Count': count_where(lambda word, tag: tag.startswith('RB')),
        'Conjunction_Count': count_where(lambda word, tag: tag.startswith('CC')),
    }
def readability_score(tweet):
    """Return ``textstat.flesch_reading_ease`` for ``tweet``."""
    score = textstat.flesch_reading_ease(tweet)
    return score
def get_url_frequency(tweet):
    """Return the number of ``http://`` / ``https://`` URLs found in ``tweet``."""
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return sum(1 for _ in re.finditer(url_pattern, tweet))
def extract_features(tweet):
    """Return all handcrafted features of ``tweet`` as one ordered dict.

    Insertion order is: eight surface statistics, the entries returned by
    ``get_linguistic_features``, then the readability score and URL count.
    Callers that consume ``.values()`` rely on this order.
    """
    features = {
        'Average_Word_Length': average_word_length(tweet),
        'Lexical_Diversity': lexical_diversity(tweet),
        'Capital_Letters_Count': count_capital_letters(tweet),
        'Hashtag_Frequency': hashtag_frequency(tweet),
        'Mention_Frequency': mention_frequency(tweet),
        'count_emojis': count_emojis(tweet),
        'special_chars_count': count_special_characters(tweet),
        'Stop_Word_Frequency': stop_word_frequency(tweet),
    }
    # Part-of-speech counts go in the middle of the vector.
    features.update(get_linguistic_features(tweet))
    features['Readability_Score'] = readability_score(tweet)
    features['URL_Frequency'] = get_url_frequency(tweet)
    return features
def personality_detection(text, threshold=0.05, endpoint= 1.0):
    """Return five sigmoid scores from the personality classifier for ``text``.

    Loads the ``Nasserelsaman/microsoft-finetuned-personality`` tokenizer and
    sequence-classification model (authenticating with the module-level
    ``PERSONALITY_TOKEN``), runs a single forward pass without gradient
    tracking, and applies a sigmoid to the logits.

    Args:
        text: Input text to score.
        threshold: Currently unused; kept for interface compatibility.
        endpoint: Currently unused; kept for interface compatibility.

    Returns:
        A list of five zero-dimensional NumPy arrays, one per entry of the
        first five logits of the first (only) batch row.
    """
    model_id = "Nasserelsaman/microsoft-finetuned-personality"
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=PERSONALITY_TOKEN)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=PERSONALITY_TOKEN)

    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    # One forward pass is enough: the earlier version ran the model twice
    # and discarded the first result.
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.sigmoid(logits)

    return [probabilities[0][index].detach().numpy() for index in range(5)]
def calc_emotion_score(tweet):
    """Return the first eleven emotion scores for ``tweet`` as a list.

    Runs the ``cardiffnlp/twitter-roberta-base-emotion-multilabel-latest``
    text-classification pipeline with all scores returned, prints every
    label/score entry, and returns the ``'score'`` values of entries 0-10 in
    pipeline order. Raises ``IndexError`` if fewer than eleven entries come
    back.
    """
    classifier = pipeline(
        "text-classification",
        model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest",
        return_all_scores=True,
    )
    emotions = classifier(tweet)[0]
    for entry in emotions:
        print(entry)
    return [emotions[index]['score'] for index in range(11)]
def load_model(tweet):
    """Build the classifier, load its weights and classify ``tweet``.

    Steps, in order:
      1. Tokenize the tweet with the ``vinai/bertweet-base`` tokenizer,
         padded to the module-level ``PADDING_MAX_LENGTH``.
      2. Concatenate emotion scores, handcrafted features and personality
         scores into one vector, stored under ``'emotion_author_vector'``.
      3. Build the model, load ``./model.pt`` and run one prediction.

    Everything (tokenizer, models, checkpoint) is re-created on each call.
    Intermediate values are printed for debugging.

    Returns:
        The rounded sigmoid of the model output as a Python float
        (0.0 or 1.0). NOTE(review): presumably 1.0 means "hate speech",
        per the demo title — confirm against the training labels.
    """
    model_name = "vinai/bertweet-base"
    bertweet_tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = bertweet_tokenizer(
        tweet,
        truncation=True,
        padding='max_length',
        max_length=PADDING_MAX_LENGTH,
        add_special_tokens=True,
        return_tensors="pt",
    )
    print(inputs)

    # Auxiliary vector: emotion scores, then handcrafted features, then
    # personality scores. The order must match what the checkpoint was
    # trained with.
    emotion_list = calc_emotion_score(tweet)
    print(emotion_list)
    emotion_list.extend(extract_features(tweet).values())
    print("emotion + author", emotion_list)
    personality_list = personality_detection(tweet)
    print("personality", personality_list)
    emotion_list.extend(personality_list)
    print("final list", emotion_list)

    # Wrap in an outer list so the tensor has a leading batch dimension of 1.
    inputs['emotion_author_vector'] = torch.tensor([emotion_list])
    print("final inputs ", inputs)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    class EmotionAuthorGuidedDCLModel(nn.Module):
        """Frozen encoder plus a linear head over [encoder output ; aux vector].

        Attribute names (``dcl_model``, ``linear``) are state_dict keys of
        the checkpoint and must not be renamed.
        """

        def __init__(self, dcl_model: nn.Module, dropout: float = 0.5):
            super().__init__()
            self.dcl_model = dcl_model
            # Width of the concatenated vector fed to the linear head.
            # NOTE(review): presumably 768 encoder dims + 34 auxiliary
            # features — confirm.
            self.dim = 802
            self.dropout = nn.Dropout(dropout)
            self.linear = nn.Linear(self.dim, 1)
            # The encoder is frozen; only the head carries gradients.
            for encoder_param in self.dcl_model.parameters():
                encoder_param.requires_grad = False

        def forward(self, batch_tokenized):
            input_ids, attention_mask, emotion_vector = (
                batch_tokenized['input_ids'],
                batch_tokenized['attention_mask'],
                batch_tokenized['emotion_author_vector'],
            )
            encoder_out = self.dcl_model(
                input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
            )
            # Index 1 of the encoder output is used as the sentence vector.
            sentence_vector = encoder_out[1]
            fused = torch.cat((sentence_vector, emotion_vector), 1)
            return self.linear(self.dropout(fused)).squeeze(1)

    # Weights are loaded onto the CPU first; the model is moved to ``device``
    # before ``load_state_dict`` copies them in.
    state_dict = torch.load("./model.pt", map_location='cpu')

    class DCLArchitecture(nn.Module):
        """Encoder wrapper with its own dense head.

        Only its ``bert`` sub-module is used below; ``forward`` is not
        called in this function.
        """

        def __init__(self, dropout: float, bert_model_name: str = 'vinai/bertweet-base'):
            super().__init__()
            self.bert = AutoModel.from_pretrained(bert_model_name)
            self.dim = 768
            self.dense = nn.Linear(self.dim, 1)
            self.dropout = nn.Dropout(dropout)

        def forward(self, batch_tokenized, if_train=False):
            encoder_out = self.bert(
                batch_tokenized['input_ids'],
                attention_mask=batch_tokenized['attention_mask'],
                output_hidden_states=True,
            )
            hidden = encoder_out[1]
            torch.cuda.empty_cache()
            if if_train:
                # Pair each vector with a dropout-perturbed copy, then
                # flatten back to rows of width ``self.dim``.
                hidden_aug = self.dropout(hidden)
                hidden = torch.cat((hidden, hidden_aug), dim=1).reshape(-1, self.dim)
            else:
                hidden = self.dropout(hidden)
            scores = self.dense(hidden).squeeze(1)
            return hidden, scores

    dcl_model = DCLArchitecture(bert_model_name=model_name, dropout=0.5)
    dcl_model.to(device)

    DROPOUT = 0.5
    model = EmotionAuthorGuidedDCLModel(dcl_model=dcl_model.bert, dropout=DROPOUT)
    model.to(device)
    model.load_state_dict(state_dict)

    def predict_single_text(model, inputs, device):
        """Run one forward pass in eval mode and return the rounded sigmoid."""
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            model.eval()
            pred = model(inputs)
            probability = torch.sigmoid(pred)
            print("prediction ", pred)
            print("sigmoid output", probability)
            print("sigmoid item", probability.item())
            return torch.round(probability).item()

    return predict_single_text(model, inputs, device)
def greet(tweet):
    """Gradio callback: classify ``tweet`` and return the result as a string.

    Prints "start" and "end" around the call to ``load_model``.
    """
    print("start")
    label = load_model(tweet)
    print("end")
    return str(label)
# Single-textbox Gradio UI: text in, predicted class (as text) out.
demo = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(
        placeholder="Enter an input sentence...",
        label="Input Sentence",
    ),
    outputs="text",
    title="Unmasking Hate: An Integrated Approach to Detecting Hate Speech in Social Media",
    allow_flagging="never",
)
demo.launch()