import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Point NLTK at the locally bundled corpora
nltk.data.path.append('/Users/kritsadakruapat/Desktop/Political-Fake-News-Detector-NLP/Fake_News_Detection/nltk_data')
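
# The preprocessing steps below rely on the punkt, stopwords, and wordnet
# corpora. If they are not already present under nltk_data, they can be fetched
# once with nltk.download('punkt'), nltk.download('stopwords'), and
# nltk.download('wordnet'); newer NLTK versions may also want 'punkt_tab'.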
# Preprocessing helper functions

# 1. Remove punctuation
def remove_punctuation(text):
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # strip ASCII punctuation
    text = re.sub(r"[^\w\s]", "", text)  # strip any remaining non-word symbols
    return text
# 2. Remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)
# 3. Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    word_tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize
    text = lemmatize_text(text)
    # Tokenize and pad the sequence (tokenizer and MAX_SEQUENCE_LENGTH are
    # module-level names defined below, before this function is first called)
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
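
# For example, preprocess_text("The senator denied the report on Tuesday.")
# yields an integer array of shape (1, MAX_SEQUENCE_LENGTH), which can be
# passed directly to model.predict().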
# Load pre-trained model
@st.cache_resource
def load_model():
    return tf.keras.models.load_model('model5.h5')

model = load_model()

# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

tokenizer = load_tokenizer()
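
# Note: model5.h5 and tokenizer.pickle are resolved relative to the working
# directory, so both files must sit alongside this script when the app runs.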
# Set parameters for text preprocessing
MAX_SEQUENCE_LENGTH = 200 # Ensure this matches what the model was trained with
THRESHOLD = 0.7  # Probability cutoff: predictions above this are labeled Fake
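
# Example: a raw model output of 0.82 is reported as 82.00% and labeled Fake;
# an output of 0.40 is reported as 40.00% and labeled Real.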
# Streamlit interface
st.title("πŸ“° US Political Fake News Text Detector By using LSTM")
st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL to analyze its authenticity or test with a sample text.")
image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)
# Display clickable links for fake news examples
st.title("πŸ”— Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")
st.markdown("[Link 3](https://newsexaminer.net/trumps-fiery-response-to-harris-dnc-speech-a-social-media-frenzy/)")
# URL input for web scraping
st.title("πŸ” Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")
# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for non-2xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles, then extract the visible text content
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to the first 1000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"
# Preprocess the text, run the model, and report the verdict against the threshold
def predict_with_threshold(text):
    preprocessed_text = preprocess_text(text)
    prediction = model.predict(preprocessed_text)
    raw_prediction = prediction[0][0]  # Raw probability that the article is fake
    # Show the raw model output so predictions can be verified
    st.write(f"Raw model prediction: {raw_prediction}")
    fake_prob = raw_prediction * 100  # Scale to a percentage
    # Apply the threshold to label the article
    if raw_prediction > THRESHOLD:
        st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
        st.write("The news article is likely Fake.")
    else:
        st.write(f"✅ Potential Fake News Probability: {fake_prob:.2f}%")
        st.write("The news article is likely Real.")
# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)

    if scraped_text.startswith("Error"):
        st.error(scraped_text)
    else:
        # Display the scraped text
        st.subheader("📄 Scraped Text:")
        st.write(scraped_text)

        # Count words in the scraped text
        token_count = len(scraped_text.split())
        st.write(f"📝 Word Count: {token_count} words")

        # Preprocess and predict
        predict_with_threshold(scraped_text)