|
import streamlit as st |
|
import torch |
|
import torch.nn as nn |
|
import transformers |
|
from transformers import AutoTokenizer,AutoModel |
|
import numpy as np |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
import pandas as pd |
|
import re |
|
# Teencode lookup table, read once at import time from a tab-separated file
# with no header row: column 0 is the teencode token, column 1 its mapping.
# dtype=str together with keep_default_na=False keeps tokens such as "NA",
# "null" or "nan" as literal strings; by default pandas would parse them as
# float NaN, which can never compare equal to a word during lookup.
teencode_df = pd.read_csv(
    'teencode.txt',
    names=['teencode', 'map'],
    sep='\t',
    dtype=str,
    keep_default_na=False,
)

teencode_list = teencode_df['teencode'].to_list()
map_list = teencode_df['map'].to_list()
|
class BCNN(nn.Module):
    """Pretrained encoder followed by a BiLSTM and two parallel 1-D convolutions.

    The encoder output is passed through a bidirectional LSTM; two
    convolutions (kernel sizes 4 and 5) slide over the sequence axis of the
    LSTM output, each feature map is max-pooled over its full length, and the
    two pooled vectors are concatenated, passed through dropout and fed to a
    linear classifier.

    Args:
        embedding_dim: Feature size of the encoder output that is fed to the
            LSTM.
        output_dim: Number of logits produced by the classifier.
        dropout: Dropout probability applied to the concatenated pooled
            features.
        bidirectional_units: Hidden size of each LSTM direction.
        conv_filters: Sequence whose first two items are the number of output
            channels of ``conv1`` and ``conv2``.
    """

    def __init__(self, embedding_dim, output_dim,
                 dropout, bidirectional_units, conv_filters):
        super().__init__()
        self.bert = AutoModel.from_pretrained('vinai/phobert-base-v2')

        self.bidirectional_lstm = nn.LSTM(
            embedding_dim, bidirectional_units, bidirectional=True, batch_first=True
        )

        # A bidirectional LSTM concatenates both directions, so the
        # convolutions see 2 * hidden-size input channels.
        lstm_channels = 2 * bidirectional_units
        self.conv1 = nn.Conv1d(
            in_channels=lstm_channels, out_channels=conv_filters[0], kernel_size=4
        )
        self.conv2 = nn.Conv1d(
            in_channels=lstm_channels, out_channels=conv_filters[1], kernel_size=5
        )

        # The classifier consumes the concatenation of both pooled feature
        # maps, so its input width must follow conv_filters rather than a
        # fixed 64 (which was only correct for filter counts summing to 64).
        self.fc = nn.Linear(conv_filters[0] + conv_filters[1], output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, b_input_ids, b_input_mask):
        """Return the classifier logits for a batch.

        Args:
            b_input_ids: Token ids, passed to the encoder as its first
                argument.
            b_input_mask: Attention mask passed to the encoder.

        Returns:
            The output of the final linear layer for each batch item.
        """
        # Index 0 selects the first element of the encoder's output.
        encoded = self.bert(b_input_ids, attention_mask=b_input_mask)[0]
        lstm_out, _ = self.bidirectional_lstm(encoded)

        # Conv1d convolves over the last axis, so move the sequence axis
        # there: (batch, seq, channels) -> (batch, channels, seq).
        features = lstm_out.permute(0, 2, 1)
        conved_1 = F.relu(self.conv1(features))
        conved_2 = F.relu(self.conv2(features))

        # Max-pool over the whole remaining length, then drop that axis.
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)

        cat = self.dropout(torch.cat((pooled_1, pooled_2), dim=1))
        return self.fc(cat)
|
|
|
class TextClassificationApp:
    """Streamlit front end that classifies user-entered text with a saved model.

    Attributes set by ``__init__``:
        device: ``cuda`` when available, otherwise ``cpu``.
        tokenizer: Hugging Face tokenizer loaded from ``model_name``.
        model: The loaded classifier, on ``device`` and in eval mode.
        class_names: Labels indexed by predicted class id.
        max_length: Fixed token length inputs are padded/truncated to.
        teencode_words: Frozen set of teencode tokens that are exempt from
            repeated-letter collapsing.
    """

    # Matches a run of one ASCII letter repeated two or more times,
    # ignoring case; compiled once instead of per word.
    _REPEATED_LETTER = re.compile(r'([A-Z])\1+', flags=re.IGNORECASE)

    def __init__(self, model_path, class_names, model_name='vinai/phobert-base-v2'):
        """
        Initialize Streamlit Text Classification App

        Args:
            model_path (str): Path to the pre-trained .pt model file
            class_names (list): List of classification labels
            model_name (str): Hugging Face model name for tokenization
        """
        st.set_page_config(
            page_title="⚖️ Text Justice Classifier",
            page_icon="⚖️",
            layout="wide"
        )

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Honour the caller-supplied path and place the model on the same
        # device the inputs are moved to in preprocess_text.
        self.model = self._load_model(model_path, self.device)

        self.class_names = class_names
        self.max_length = 128

        # Built once so that per-word membership tests are O(1).
        self.teencode_words = frozenset(teencode_list)

    @staticmethod
    def _load_model(model_path, device):
        """Load the checkpoint at ``model_path`` onto ``device`` in eval mode.

        The checkpoint may be either a pickled ``nn.Module`` or a state dict
        for ``BCNN``.
        """
        # SECURITY: torch.load unpickles arbitrary objects; only point this
        # at checkpoint files you trust.
        try:
            # A pickled module cannot be restored in weights-only mode, which
            # newer PyTorch releases enable by default.
            checkpoint = torch.load(model_path, map_location=device, weights_only=False)
        except TypeError:
            # PyTorch releases that predate the ``weights_only`` keyword.
            checkpoint = torch.load(model_path, map_location=device)

        if isinstance(checkpoint, nn.Module):
            model = checkpoint
        else:
            # Treat anything else as a state dict for the BCNN architecture.
            EMBEDDING_DIM = 768
            OUTPUT_DIM = 2
            DROPOUT = 0.1
            CONV_FILTERS = [32, 32]
            BIDIRECTIONAL_UNITS = 128
            model = BCNN(EMBEDDING_DIM, OUTPUT_DIM, DROPOUT, BIDIRECTIONAL_UNITS, CONV_FILTERS)
            model.load_state_dict(checkpoint)

        model.to(device)
        model.eval()
        return model

    def remove_dub_char(self, sentence):
        """Collapse repeated ASCII letters in every word that is not teencode.

        Args:
            sentence: Text to normalise; converted with ``str()`` first.

        Returns:
            str: The whitespace-split words joined by single spaces. Words
            found in ``self.teencode_words`` are kept verbatim; in all other
            words each run of one repeated ASCII letter (case-insensitive)
            is reduced to the first character of the run.
        """
        known = self.teencode_words
        collapse = self._REPEATED_LETTER.sub
        return ' '.join(
            word if word in known else collapse(r'\1', word)
            for word in str(sentence).strip().split()
        )

    def preprocess_text(self, text):
        """
        Preprocess input text for model prediction

        Args:
            text (str): Input text to classify

        Returns:
            tuple: (input_ids, attention_mask) tensors on ``self.device``
        """
        text = self.remove_dub_char(text)

        # Calling the tokenizer directly replaces the deprecated encode_plus;
        # with return_tensors='pt' the tensors already carry a batch axis, so
        # no list/torch.cat round trip is needed.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids'].to(self.device)
        attention_masks = encoded['attention_mask'].to(self.device)
        return input_ids, attention_masks

    def predict(self, text):
        """
        Make prediction on the input text

        Args:
            text (str): Input text to classify

        Returns:
            tuple: (predicted class, probabilities) as NumPy arrays holding
            the single top class index and its softmax probability
        """
        inputs, mask = self.preprocess_text(text)

        with torch.no_grad():
            outputs = self.model(inputs, mask)
            probabilities = torch.softmax(outputs, dim=1)
            top_probs, top_classes = torch.topk(probabilities, k=1)

        # Index 0 selects the only item of the batch.
        return top_classes[0].cpu().numpy(), top_probs[0].cpu().numpy()

    def run(self):
        """
        Main Streamlit app runner

        Renders the text area and the classify button. When the button is
        pressed with non-blank text, shows the prediction and the input
        details; with blank text, shows a warning.
        """
        st.title("📄 Text Classification")
        st.write("Enter text to classify")

        text_input = st.text_area(
            "Paste your text here",
            height=250,
            placeholder="Enter the text you want to classify..."
        )

        if not st.button("Classify Text"):
            return

        if not text_input.strip():
            st.warning("Please enter some text to classify")
            return

        top_classes, top_probs = self.predict(text_input)

        st.subheader("Classification Results")
        cols = st.columns(3)
        for i, (cls, prob) in enumerate(zip(top_classes, top_probs)):
            with cols[i]:
                st.metric(
                    label=f"Top {i+1} Prediction",
                    value=f"{self.class_names[cls]}",
                    delta=f"{prob:.2%}"
                )

        with st.expander("Input Text Details"):
            st.write("**Original Text:**")
            st.write(text_input)
            st.write(f"**Text Length:** {len(text_input)} characters")
|
|
|
def main():
    """Create the classification app for the ``toxic.pt`` checkpoint and run it."""
    model_path = 'toxic.pt'
    # Labels are looked up by predicted class index.
    class_names = [
        'Non-toxic',
        'Toxic',
    ]

    app = TextClassificationApp(model_path, class_names)
    app.run()
|
|
|
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()