import streamlit as st
import zipfile
import os
import requests
import re
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
import numpy as np
from PIL import Image
from joblib import load
import math

# Custom headers for the HTTP request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}

#################### Load the banner image ##########

# Fetch the image from the URL
banner_image_request = requests.get("https://jaifar.net/ADS/banner.jpg", headers=headers)

# Save the downloaded content
banner_image_path = "banner.jpg"
with open(banner_image_path, "wb") as f:
    f.write(banner_image_request.content)

# Open the image
banner_image = Image.open(banner_image_path)

# Display the image using streamlit
st.image(banner_image, caption='', use_column_width=True)

################ end loading banner image ##################


def get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction):
    author_map = {
        "googlebard": "Google Bard",
        "gpt3": "ChatGPT-3",
        "gpt4": "ChatGPT-4",
        "huggingface": "HuggingChat",
        "human": "Human-Written"
    }
    cnn_predicted_author_display_name = author_map.get(predicted_author, predicted_author)
    ridge_predicted_author_display_name = author_map.get(ridge_prediction[0], ridge_prediction[0])
    extra_trees_predicted_author_display_name = author_map.get(extra_trees_prediction[0], extra_trees_prediction[0])
    return cnn_predicted_author_display_name, ridge_predicted_author_display_name, extra_trees_predicted_author_display_name


############# Download or check files/folders existence ##############

# Check if the model folder exists
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
    try:
        # Download the model
        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
        r = requests.get(model_url, headers=headers)
        r.raise_for_status()

        # Debugging: Check if download is successful by examining content length
        # st.write(f"Downloaded model size: {len(r.content)} bytes")

        # Save the downloaded content
        with open(zip_file_path, "wb") as f:
            f.write(r.content)

        # Debugging: Verify that the zip file exists
        if os.path.exists(zip_file_path):
            # st.write("Zip file exists")

            # Extract the model using zipfile
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall('my_authorship_model')

            # # Debugging: Check if the folder is successfully created
            # if os.path.exists('my_authorship_model'):
            #     # st.write("Model folder successfully extracted using zipfile")
            #     # Debugging: List the directory contents after extraction
            #     # st.write("Listing directory contents:")
            #     # st.write(os.listdir('.'))
            # else:
            #     st.write("Model folder was not extracted successfully using zipfile")
            #     exit(1)
        else:
            st.write("Zip file does not exist")
            exit(1)
    except Exception as e:
        st.write(f"Failed to download or extract the model: {e}")
        exit(1)
else:
    st.write("Version: 1.0")

# Download the required files
file_urls = {
    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}

for filename, url in file_urls.items():
    if not os.path.exists(filename):  # Check if the file doesn't exist
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            st.write(f"Failed to download {filename}: {e}")
            exit(1)
    # else:
    #     st.write(f"File {filename} already exists. Skipping download.")
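
# --- Illustrative sketch (not wired into the app) ---
# The banner image, the model ZIP, and the pickle downloads above all repeat the
# same requests.get / raise_for_status / write-to-disk steps. A small helper such
# as the hypothetical download_file() below could consolidate that pattern; it is
# defined here only as a sketch and is not called anywhere in this script.
def download_file(url, destination, request_headers=headers):
    """Fetch `url` with the shared headers and save the response body to `destination`."""
    response = requests.get(url, headers=request_headers)
    response.raise_for_status()  # surface HTTP errors instead of silently writing a bad file
    with open(destination, "wb") as out_file:
        out_file.write(response.content)
    return destination
# Example usage (illustrative): download_file("https://jaifar.net/ADS/banner.jpg", "banner.jpg")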
############ download ridge and ExtraTree stuff

# def has_internet_connection():
#     try:
#         response = requests.get("https://www.google.com/", timeout=5)
#         return True
#     except requests.ConnectionError:
#         return False


def is_zip_file(file_path):
    return zipfile.is_zipfile(file_path)


def are_files_extracted(extracted_files, missing_files):
    for file in missing_files:
        if file not in extracted_files:
            return False
    return True


def check_and_download_files():
    file_names = [
        "truncated_260_to_284.xlsx_vectorizer.pkl",
        "not_trancated_full_paragraph.xlsx_extra_trees_model.pkl",
        "not_trancated_full_paragraph.xlsx_ridge_model.pkl",
        "not_trancated_full_paragraph.xlsx_vectorizer.pkl",
        "truncated_10_to_34.xlsx_extra_trees_model.pkl",
        "truncated_10_to_34.xlsx_ridge_model.pkl",
        "truncated_10_to_34.xlsx_vectorizer.pkl",
        "truncated_35_to_59.xlsx_extra_trees_model.pkl",
        "truncated_35_to_59.xlsx_ridge_model.pkl",
        "truncated_35_to_59.xlsx_vectorizer.pkl",
        "truncated_60_to_84.xlsx_extra_trees_model.pkl",
        "truncated_60_to_84.xlsx_ridge_model.pkl",
        "truncated_60_to_84.xlsx_vectorizer.pkl",
        "truncated_85_to_109.xlsx_extra_trees_model.pkl",
        "truncated_85_to_109.xlsx_ridge_model.pkl",
        "truncated_85_to_109.xlsx_vectorizer.pkl",
        "truncated_110_to_134.xlsx_extra_trees_model.pkl",
        "truncated_110_to_134.xlsx_ridge_model.pkl",
        "truncated_110_to_134.xlsx_vectorizer.pkl",
        "truncated_135_to_159.xlsx_extra_trees_model.pkl",
        "truncated_135_to_159.xlsx_ridge_model.pkl",
        "truncated_135_to_159.xlsx_vectorizer.pkl",
        "truncated_160_to_184.xlsx_extra_trees_model.pkl",
        "truncated_160_to_184.xlsx_ridge_model.pkl",
        "truncated_160_to_184.xlsx_vectorizer.pkl",
        "truncated_185_to_209.xlsx_extra_trees_model.pkl",
        "truncated_185_to_209.xlsx_ridge_model.pkl",
        "truncated_185_to_209.xlsx_vectorizer.pkl",
        "truncated_210_to_234.xlsx_extra_trees_model.pkl",
        "truncated_210_to_234.xlsx_ridge_model.pkl",
        "truncated_210_to_234.xlsx_vectorizer.pkl",
        "truncated_235_to_259.xlsx_extra_trees_model.pkl",
        "truncated_235_to_259.xlsx_ridge_model.pkl",
        "truncated_235_to_259.xlsx_vectorizer.pkl",
        "truncated_260_to_284.xlsx_extra_trees_model.pkl",
        "truncated_260_to_284.xlsx_ridge_model.pkl"
    ]

    missing_files = []
    for file_name in file_names:
        if not os.path.exists(file_name):
            missing_files.append(file_name)

    if missing_files:
        # st.write("The following files are missing:")
        st.write("Some files are missing")
        # for file_name in missing_files:
        #     st.write(file_name)

        # if not has_internet_connection():
        #     st.write("No internet connection. Cannot download missing files.")
        #     return

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            }
            url = 'https://jaifar.net/ADS/content.zip'
            response = requests.get(url, headers=headers)
            response.raise_for_status()

            with open('content.zip', 'wb') as zip_file:
                zip_file.write(response.content)

            if not is_zip_file('content.zip'):
                st.write("Downloaded content is not a ZIP file.")
                return

            with zipfile.ZipFile('content.zip', 'r') as zip_ref:
                zip_ref.extractall()

            extracted_files = os.listdir()
            if not are_files_extracted(extracted_files, missing_files):
                st.write("Not all missing files were extracted.")
                return

            st.write("content.zip downloaded and extracted successfully.")
        except Exception as e:
            st.write(f"Error downloading or extracting content.zip: {e}")
    # else:
    #     st.write("All files exist.")


check_and_download_files()

############### Load CNN Model ############

# Load the saved model
loaded_model = load_model("my_authorship_model")

# Load the saved tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)

max_length = 300

############### End Load CNN Model ############
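
# --- Illustrative sketch (commented out): caching the heavy artefacts ---
# On every Streamlit rerun the block above reloads the Keras model and the two
# pickles from disk. If the installed Streamlit version provides st.cache_resource,
# a cached loader along these lines could avoid that. The availability of the
# decorator in the deployment environment is an assumption, so this stays a sketch.
#
# @st.cache_resource
# def load_cnn_artifacts():
#     model = load_model("my_authorship_model")
#     with open('tokenizer.pkl', 'rb') as handle:
#         tok = pickle.load(handle)
#     with open('label_encoder.pkl', 'rb') as handle:
#         le = pickle.load(handle)
#     return model, tok, le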

# Function to predict the author for new text
def predict_author(new_text, model, tokenizer, label_encoder):
    sequence = tokenizer.texts_to_sequences([new_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(padded_sequence)

    predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
    probabilities = prediction[0]

    author_probabilities = {}
    for idx, prob in enumerate(probabilities):
        author = label_encoder.inverse_transform([idx])[0]
        author_probabilities[author] = prob

    return predicted_label, author_probabilities
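
# Example (illustrative values): predict_author() returns the most probable label
# together with a per-author probability dictionary, e.g.
#   label, probs = predict_author("Some paragraph...", loaded_model, tokenizer, label_encoder)
#   label -> "human"
#   probs -> {"human": 0.91, "gpt4": 0.05, "gpt3": 0.02, "googlebard": 0.01, "huggingface": 0.01}
# The numbers shown are made up for illustration; real outputs depend on the trained model.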

new_text = st.text_area("Input Your Text Here:")

# Creates the 'Human or Robot?' button
press_me_button = st.button("Human or Robot?")

if press_me_button:

    ########## ML
    word_count = len(re.findall(r'\w+', new_text))
    st.write(f"Words Count: {word_count}")

    # Choose the appropriate model based on word count
    if 10 <= word_count <= 34:
        file_prefix = 'truncated_10_to_34.xlsx'
    elif 35 <= word_count <= 59:
        file_prefix = 'truncated_35_to_59.xlsx'
    elif 60 <= word_count <= 84:
        file_prefix = 'truncated_60_to_84.xlsx'
    elif 85 <= word_count <= 109:
        file_prefix = 'truncated_85_to_109.xlsx'
    elif 110 <= word_count <= 134:
        file_prefix = 'truncated_110_to_134.xlsx'
    elif 135 <= word_count <= 159:
        file_prefix = 'truncated_135_to_159.xlsx'
    elif 160 <= word_count <= 184:
        file_prefix = 'truncated_160_to_184.xlsx'
    elif 185 <= word_count <= 209:
        file_prefix = 'truncated_185_to_209.xlsx'
    elif 210 <= word_count <= 234:
        file_prefix = 'truncated_210_to_234.xlsx'
    elif 235 <= word_count <= 259:
        file_prefix = 'truncated_235_to_259.xlsx'
    elif 260 <= word_count <= 284:
        file_prefix = 'truncated_260_to_284.xlsx'
    else:
        file_prefix = 'not_trancated_full_paragraph.xlsx'

    # Load the models and vectorizer
    with open(f"{file_prefix}_ridge_model.pkl", 'rb') as file:
        ridge_model = pickle.load(file)

    with open(f"{file_prefix}_extra_trees_model.pkl", 'rb') as file:
        extra_trees_model = pickle.load(file)

    with open(f"{file_prefix}_vectorizer.pkl", 'rb') as file:
        vectorizer = pickle.load(file)

    # ML Vectorizing the input
    user_input_transformed = vectorizer.transform([new_text])

    # ML predictions
    ridge_prediction = ridge_model.predict(user_input_transformed)
    extra_trees_prediction = extra_trees_model.predict(user_input_transformed)

    # CNN prediction + Vectorizing the input
    predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
    sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

    author_map = {
        "googlebard": "Google Bard",
        "gpt3": "ChatGPT-3",
        "gpt4": "ChatGPT-4",
        "huggingface": "HuggingChat",
        "human": "Human-Written"
    }

    cnn_name, ridge_name, extra_trees_name = get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction)

    with st.expander("Prediction Details (Click Here)..."):
        st.write(f"Ridge: {ridge_name}")
        st.write(f"ExtraTree: {extra_trees_name}")
        st.write(f"CNN: {cnn_name}")
        st.write("_" * 10)
        st.write("CNN Prediction Probabilities:")
        for author, prob in sorted_probabilities:
            display_name = author_map.get(author, author)
            st.write(f"{display_name}: {prob * 100:.2f}%")
            st.progress(float(prob))

    max_cnn_prob_name = sorted_probabilities[0][0]
    max_cnn_prob = float(sorted_probabilities[0][1])

    # Compare the scalar predictions (the sklearn models return 1-element arrays)
    ridge_label = ridge_prediction[0]
    extra_trees_label = extra_trees_prediction[0]

    if word_count < 10.0 or word_count > 1081.0:
        st.warning("For better predictions, input a text between 10 and 1081 words", icon="ℹ️")
    elif word_count < 256:
        if ridge_label == extra_trees_label == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.info("We are quite confident in the accuracy of this result.", icon="ℹ️")
        elif ridge_label == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.success(f"2nd Most likely written by: **{extra_trees_name}**", icon="✅")
            st.write("_" * 30)
        elif extra_trees_label == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.success(f"2nd Most likely written by: **{ridge_name}**", icon="✅")
            st.write("_" * 30)
        else:
            st.warning("Notice 1: There is difficulty predicting your text; it might fall into one of the options below:", icon="⚠️")
            st.success(f"1- **{cnn_name}**", icon="✅")
            st.success(f"2- **{ridge_name}**", icon="✅")
            st.success(f"3- **{extra_trees_name}**", icon="✅")
    else:
        if ridge_label == extra_trees_label == predicted_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.info("We are quite confident in the accuracy of this result.", icon="ℹ️")
        elif ridge_label == predicted_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.success(f"2nd Most likely written by: **{extra_trees_name}**", icon="✅")
            st.write("_" * 30)
        elif ridge_label == extra_trees_label:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.success(f"2nd Most likely written by: **{cnn_name}**", icon="✅")
            st.write("_" * 30)
        else:
            st.warning("Notice 1: There is difficulty predicting your text; it might fall into one of the options below:", icon="⚠️")
            st.success(f"1- **{ridge_name}**", icon="✅")
            st.success(f"2- **{cnn_name}**", icon="✅")
            st.success(f"3- **{extra_trees_name}**", icon="✅")
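
# --- Illustrative note on the bucket selection above ---
# The if/elif chain inside the button handler maps the word count to one of the
# 25-word-wide buckets (10-34, 35-59, ..., 260-284). An arithmetic equivalent is
# sketched below for reference only; the explicit chain above remains the one in use.
# def choose_file_prefix(word_count):
#     if 10 <= word_count <= 284:
#         lower = 10 + 25 * ((word_count - 10) // 25)   # bucket lower bound
#         return f"truncated_{lower}_to_{lower + 24}.xlsx"
#     return "not_trancated_full_paragraph.xlsx"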
""") # Aim and Objectives with st.expander("Aim and Objectives"): st.write(""" The project aims to help staff at the University of Portsmouth distinguish between student-written artifacts and those generated by LLMs. It focuses on text feature extraction, model testing, and implementing a user-friendly dashboard among other objectives. """) # System Details with st.expander("How does the system work?"): st.write(""" The system is trained using deep learning model on a dataset of 140,546 paragraphs, varying in length from 10 to 1090 words. It achieves an accuracy of 0.9964 with a validation loss of 0.094. """) # Fetch the image from the URL accuracy_image_request = requests.get("https://jaifar.net/ADS/best_accuracy.png", headers=headers) # Save the downloaded content image_path = "best_accuracy.png" with open(image_path, "wb") as f: f.write(accuracy_image_request.content) # Open the image accuracy_image = Image.open(image_path) # Display the image using streamlit st.image(accuracy_image, caption='Best Accuracy', use_column_width=True) # Data Storage Information with st.expander("Does the system store my data?"): st.write("No, the system does not collect or store any user input data.") # Use-case Limitation with st.expander("Can I use this as evidence?"): st.write(""" No, this system is a Proof of Concept (POC) and should not be used as evidence against students or similar entities. """)