Spaces:

zerostratos
/

toxic_classification_model

Sleeping

App Files Files Community

toxic_classification_model / app.py

zerostratos

Update app.py

8fc3e0c verified about 2 months ago

raw

history blame contribute delete

7.38 kB

	import streamlit as st
	import torch
	import torch.nn as nn
	import transformers
	from transformers import AutoTokenizer,AutoModel
	import numpy as np
	import torch.nn as nn
	import torch.nn.functional as F
	import pandas as pd
	import re
	# Load the teencode table: a headerless, tab-separated file whose two
	# columns are named 'teencode' and 'map' here.
	teencode_df = pd.read_csv('teencode.txt', sep='\t', names=['teencode', 'map'])
	# Expose each column as a plain Python list for the rest of the module.
	teencode_list, map_list = (
	    teencode_df[column].to_list() for column in ('teencode', 'map')
	)
	class BCNN(nn.Module):
	    """PhoBERT encoder followed by a BiLSTM, two parallel 1-D convolutions
	    with global max pooling, dropout, and a linear classification head.
	    """

	    def __init__(self, embedding_dim, output_dim,
	                 dropout, bidirectional_units, conv_filters):
	        """
	        Build the network layers.

	        Args:
	            embedding_dim (int): Feature size fed to the LSTM (its input_size).
	            output_dim (int): Number of logits produced by the final layer.
	            dropout (float): Dropout probability applied to the pooled features.
	            bidirectional_units (int): Hidden size of each LSTM direction.
	            conv_filters (sequence of int): Output channels of the kernel-size-4
	                and kernel-size-5 convolutions, in that order.
	        """
	        super().__init__()
	        self.bert = AutoModel.from_pretrained('vinai/phobert-base-v2')
	        self.bidirectional_lstm = nn.LSTM(
	            embedding_dim, bidirectional_units, bidirectional=True, batch_first=True
	        )

	        # A bidirectional LSTM concatenates both directions on the feature axis.
	        lstm_features = 2 * bidirectional_units
	        self.conv1 = nn.Conv1d(in_channels=lstm_features, out_channels=conv_filters[0], kernel_size=4)
	        self.conv2 = nn.Conv1d(in_channels=lstm_features, out_channels=conv_filters[1], kernel_size=5)

	        # The head consumes the concatenation of both pooled conv outputs, so
	        # its input width is the sum of the filter counts. This used to be a
	        # hard-coded 64, which is only correct for conv_filters == [32, 32].
	        self.fc = nn.Linear(conv_filters[0] + conv_filters[1], output_dim)

	        self.dropout = nn.Dropout(dropout)

	    def forward(self, b_input_ids, b_input_mask):
	        """
	        Compute class logits for a batch of tokenized inputs.

	        Args:
	            b_input_ids: Token-id tensor passed to the encoder.
	            b_input_mask: Attention-mask tensor passed to the encoder.

	        Returns:
	            torch.Tensor: Output of the final linear layer,
	            [batch size, output_dim].
	        """
	        # Element 0 of the encoder output; presumably the per-token hidden
	        # states [batch size, sent len, embedding_dim] - confirm against the
	        # transformers model documentation.
	        encoded = self.bert(b_input_ids, attention_mask=b_input_mask)[0]
	        embedded, _ = self.bidirectional_lstm(encoded)
	        # Conv1d expects channels before length: move features to dim 1.
	        embedded = embedded.permute(0, 2, 1)
	        conved_1 = F.relu(self.conv1(embedded))
	        conved_2 = F.relu(self.conv2(embedded))
	        # conved_n = [batch size, n_filters, sent len - kernel_size[n] + 1]

	        # Global max pooling over the whole remaining length dimension.
	        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
	        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
	        # pooled_n = [batch size, n_filters]

	        cat = self.dropout(torch.cat((pooled_1, pooled_2), dim=1))
	        # cat = [batch size, conv_filters[0] + conv_filters[1]]

	        result = self.fc(cat)

	        return result

	class TextClassificationApp:
	    """Streamlit front end that classifies text with a saved PyTorch model."""

	    # Matches a run of the same ASCII letter repeated two or more times
	    # (case-insensitive); compiled once instead of on every word.
	    _REPEATED_LETTER = re.compile(r'([A-Z])\1+', flags=re.IGNORECASE)

	    def __init__(self, model_path, class_names, model_name='vinai/phobert-base-v2'):
	        """
	        Initialize Streamlit Text Classification App

	        Args:
	            model_path (str): Path to the pre-trained .pt model file
	            class_names (list): List of classification labels
	            model_name (str): Hugging Face model name for tokenization
	        """
	        # Streamlit page configuration
	        st.set_page_config(
	            page_title="⚖️ Text Justice Classifier",
	            page_icon="⚖️",
	            layout="wide"
	        )

	        # Device configuration
	        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	        # Load tokenizer
	        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

	        # Load the model from `model_path` (previously the argument was
	        # ignored in favour of a hard-coded file name). The loaded object is
	        # used directly as the model, so no fresh network is constructed here;
	        # doing so only to overwrite it wasted a full encoder initialisation.
	        # The model is placed on the same device the inputs are moved to;
	        # pinning it to the CPU caused a device mismatch on CUDA hosts.
	        # NOTE(review): torch.load unpickles arbitrary objects - only load
	        # checkpoints from a trusted source. Newer PyTorch releases may also
	        # require weights_only=False for whole-model checkpoints; confirm
	        # against the deployed torch version.
	        self.model = torch.load(model_path, map_location=self.device)
	        self.model.to(self.device)
	        self.model.eval()  # Set to evaluation mode

	        # Store class names
	        self.class_names = class_names

	        # Maximum sequence length
	        self.max_length = 128

	        # Cache the teencode vocabulary as a set for O(1) membership tests.
	        self._teencode_words = frozenset(teencode_list)

	    def remove_dub_char(self, sentence):
	        """
	        Collapse repeated letters in every word that is not a teencode word.

	        Args:
	            sentence: Input text; converted with str() before processing.

	        Returns:
	            str: Whitespace-separated words joined by single spaces, where
	            each run of one repeated ASCII letter is reduced to one letter.
	            Words found in the teencode vocabulary are kept unchanged.
	        """
	        cleaned = []
	        for word in str(sentence).split():
	            if word in self._teencode_words:
	                cleaned.append(word)
	            else:
	                cleaned.append(self._REPEATED_LETTER.sub(r'\1', word))
	        return ' '.join(cleaned)

	    def preprocess_text(self, text):
	        """
	        Preprocess input text for model prediction

	        Args:
	            text (str): Input text to classify

	        Returns:
	            tuple: (input_ids, attention_mask) tensors on self.device
	        """
	        text = self.remove_dub_char(text)
	        # Tokenize and encode the text, padded/truncated to max_length
	        encoded = self.tokenizer.encode_plus(
	            text,
	            add_special_tokens=True,
	            max_length=self.max_length,
	            padding='max_length',
	            truncation=True,
	            return_tensors='pt'
	        )
	        # The tokenizer output is already batched for the single input, so
	        # the tensors are used as they are.
	        input_ids = encoded['input_ids'].to(self.device)
	        attention_masks = encoded['attention_mask'].to(self.device)
	        return input_ids, attention_masks

	    def predict(self, text):
	        """
	        Make prediction on the input text

	        Args:
	            text (str): Input text to classify

	        Returns:
	            tuple: (top class indices, top probabilities) as NumPy arrays
	            holding the single best prediction
	        """
	        # Preprocess the text
	        inputs, mask = self.preprocess_text(text)

	        # Disable gradient calculation
	        with torch.no_grad():
	            # Get model outputs
	            outputs = self.model(inputs, mask)

	            # Apply softmax to get probabilities
	            probabilities = torch.softmax(outputs, dim=1)

	            # Get top predictions
	            top_probs, top_classes = torch.topk(probabilities, k=1)

	        return top_classes[0].cpu().numpy(), top_probs[0].cpu().numpy()

	    def run(self):
	        """
	        Main Streamlit app runner
	        """
	        # Title and description
	        st.title("📄 Text Classification")
	        st.write("Enter text to classify")

	        # Text input
	        text_input = st.text_area(
	            "Paste your text here",
	            height=250,
	            placeholder="Enter the text you want to classify..."
	        )

	        # Nothing more to render until the button is pressed
	        if not st.button("Classify Text"):
	            return

	        if not text_input.strip():
	            st.warning("Please enter some text to classify")
	            return

	        # Make prediction
	        top_classes, top_probs = self.predict(text_input)

	        # Display results
	        st.subheader("Classification Results")

	        # Create columns for results
	        cols = st.columns(3)

	        for i, (cls, prob) in enumerate(zip(top_classes, top_probs)):
	            with cols[i]:
	                st.metric(
	                    label=f"Top {i+1} Prediction",
	                    value=f"{self.class_names[cls]}",
	                    delta=f"{prob:.2%}"
	                )

	        # Show input text details
	        with st.expander("Input Text Details"):
	            st.write("Original Text:")
	            st.write(text_input)
	            st.write(f"Text Length: {len(text_input)} characters")

	def main():
	    """Create the toxic-text classifier app and start its Streamlit UI."""
	    # Labels are looked up by predicted class index when results are shown.
	    # Swap in your own checkpoint path and label list here.
	    labels = ['Non-toxic', 'Toxic']
	    TextClassificationApp('toxic.pt', labels).run()


	# Run the app only when this file is executed as a script.
	if __name__ == "__main__":
	    main()