Spaces:
Sleeping
Sleeping
import pandas as pd | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.model_selection import train_test_split | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import warnings | |
warnings.filterwarnings("ignore", category=FutureWarning) | |
import pickle | |
def preprocess_bag_of_words(preprocessed_text_list):
    """Fit a bag-of-words model on the given documents.

    Args:
        preprocessed_text_list: Iterable of mappings, each providing the
            document text under the ``"original_text"`` key.

    Returns:
        A ``(bow_df, vectorizer)`` tuple: a dense DataFrame of token counts
        with one row per document and one column per vocabulary term, and
        the fitted ``CountVectorizer`` that produced it.
    """
    documents = [entry["original_text"] for entry in preprocessed_text_list]

    # Learn the vocabulary and count term occurrences in a single pass.
    vectorizer = CountVectorizer()
    count_matrix = vectorizer.fit_transform(documents)

    # Densify the sparse counts so columns can be labelled by vocabulary term.
    vocabulary = vectorizer.get_feature_names_out()
    dense_counts = count_matrix.toarray()
    bow_df = pd.DataFrame(dense_counts, columns=vocabulary)
    return bow_df, vectorizer
if __name__ == "__main__":
    # --- Load the data and build bag-of-words features --------------------
    preprocessed_data = pd.read_pickle('preprocessed_deeds.pkl')
    texts = preprocessed_data['original_text']
    # preprocess_bag_of_words expects dicts keyed by "original_text".
    preprocessed_text_list = [{"original_text": text} for text in texts]
    bow_df, vectorizer = preprocess_bag_of_words(preprocessed_text_list)

    X = bow_df
    y = preprocessed_data['is_racist']

    # --- Train on 70% of the rows, hold out 30% (fixed seed) --------------
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(X_train, y_train)

    # --- Persist the fitted vectorizer and model --------------------------
    artifacts = (
        ('vectorizer.pkl', vectorizer),
        ('logistic_model.pkl', logistic_model),
    )
    for artifact_path, artifact in artifacts:
        with open(artifact_path, 'wb') as out_file:
            pickle.dump(artifact, out_file)

    # --- Text metrics on the held-out split -------------------------------
    y_pred = logistic_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # --- Confusion matrix heatmap -----------------------------------------
    # NOTE(review): the tick labels assume two classes ordered
    # non-racist, racist -- confirm against the 'is_racist' column values.
    class_labels = ['Non-racist', 'Racist']
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # --- ROC curve --------------------------------------------------------
    # Column 1 of predict_proba is the probability of the model's second class.
    class_probabilities = logistic_model.predict_proba(X_test)
    y_prob = class_probabilities[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    # --- Ten vocabulary terms with the largest coefficients ---------------
    coefficients = logistic_model.coef_[0]
    feature_importance = pd.Series(
        coefficients, index=vectorizer.get_feature_names_out()
    )
    top_features = feature_importance.nlargest(10)
    plt.figure(figsize=(8, 6))
    top_features.plot(kind='barh', color='skyblue')
    plt.title('Top 10 Most Influential Words for Racist Classification')
    plt.xlabel('Coefficient Value')
    plt.ylabel('Word')
    plt.show()
# Function to make predictions based on the trained model | |
def predict(processed_text, vectorizer, logistic_model):
    """Classify a single document with a fitted vectorizer and model.

    Args:
        processed_text: Mapping that provides the document text under the
            ``"original_text"`` key.
        vectorizer: Object whose ``transform`` turns a one-element list of
            texts into features the model accepts.
        logistic_model: Object whose ``predict`` returns one label per row.

    Returns:
        A dict ``{'is_racist': bool}`` holding the truthiness of the first
        predicted label.
    """
    document = processed_text["original_text"]
    # transform() works on a collection of documents, so wrap the single text.
    features = vectorizer.transform([document])
    first_label = logistic_model.predict(features)[0]
    return {'is_racist': bool(first_label)}