import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
import os
from nltk.corpus import wordnet
# Set the NLTK data path
nltk_data_path = 'nltk_data'
nltk.data.path.append(nltk_data_path)
# Download 'punkt', 'stopwords', 'wordnet', and 'averaged_perceptron_tagger' if not already present
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
    nltk.download('punkt', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
    nltk.download('stopwords', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
    nltk.download('wordnet', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'taggers/averaged_perceptron_tagger')):
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)
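# Note (assumption, not part of the original setup): recent NLTK releases also ship a separate
# 'punkt_tab' package for word_tokenize; if tokenization raises a LookupError at runtime,
# downloading 'punkt_tab' into the same directory in the same way may be required.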
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
# Convert an NLTK (Penn Treebank) POS tag string to a WordNet POS constant
def get_wordnet_pos(treebank_tag):
    # The first letter of the Treebank tag (e.g. 'NN', 'VBD', 'JJ', 'RB') identifies the word class
    tag = treebank_tag[0].upper()
    tag_dict = {'J': wordnet.ADJ,
                'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
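# Illustrative example (actual tags depend on the tagger): pos_tag(word_tokenize("officials voted quickly"))
# returns pairs like ('officials', 'NNS'), ('voted', 'VBD'), ('quickly', 'RB'); their tags map to
# wordnet.NOUN, wordnet.VERB, and wordnet.ADV, so the lemmatizer can pick the correct base form.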
# Preprocessing helper functions
# 1. Remove punctuation
def remove_punctuation(text):
    # Escape the punctuation set so characters like ']' and '-' are treated literally in the class
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text
# 2. Remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)
# 3. Lemmatization with POS tagging
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    pos_tags = pos_tag(word_tokens)  # POS tagging
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower(), get_wordnet_pos(tag))
                         for word, tag in pos_tags if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
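# Illustrative example (exact output depends on the NLTK data versions):
#   lemmatize_text("Senators were debating the new policies")
#   -> roughly "senator debate new policy" (stopwords removed, words lemmatized by POS)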
# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize with POS tagging
    text = lemmatize_text(text)
    # Convert to a padded sequence using the module-level tokenizer and MAX_SEQUENCE_LENGTH
    # defined further down (both are loaded before this function is ever called)
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
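# The returned value is an array of shape (1, MAX_SEQUENCE_LENGTH): a single row of word indices,
# zero-padded on the right. Words the tokenizer never saw during training are dropped
# (or mapped to an OOV index, if the tokenizer was fitted with an oov_token).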
# Load pre-trained model
@st.cache_resource
def load_model():
    try:
        model = tf.keras.models.load_model('model5.h5')
        st.success("Model loaded successfully!")
        return model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None
model = load_model()
# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    try:
        with open('tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        st.success("Tokenizer loaded successfully!")
        return tokenizer
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        return None
tokenizer = load_tokenizer()
# Set parameters for text preprocessing. These must match the values the model was trained with.
MAX_SEQUENCE_LENGTH = 200
THRESHOLD = 0.7
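# Assumption: the model ends in a single sigmoid unit, so each prediction is one probability in
# [0, 1]; values above THRESHOLD are labelled "Fake" below, everything else "Real".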
# Streamlit interface
st.title("πŸ“° US Political Fake News Text Detector By using LSTM")
st.write("Detail and Progression here:")
st.markdown("[github](https://github.com/yamerooo123/Political-Fake-News-Detector-NLP)")
# URL input for web scraping
st.title("πŸ” Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")
# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles, then extract the text content
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to the first 1000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"
# Run the model on the preprocessed text and report the result against the threshold;
# the raw model output is shown so predictions can be verified
def predict_with_threshold(text):
    if model and tokenizer:
        preprocessed_text = preprocess_text(text)
        try:
            prediction = model.predict(preprocessed_text)
            raw_prediction = prediction[0][0]  # Raw probability from the model
            # Show the raw prediction to see the actual output
            st.write(f"Raw model prediction: {raw_prediction}")
            fake_prob = raw_prediction * 100  # Scale to a percentage
            # Apply the threshold and show the result based on the raw prediction
            if raw_prediction > THRESHOLD:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Fake.")
            else:
                st.write(f"Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Real.")
        except Exception as e:
            st.error(f"Error during prediction: {e}")
    else:
        st.error("Model or tokenizer not loaded. Cannot make predictions.")
# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)
    if "Error" in scraped_text:
        st.error(scraped_text)
    else:
        # Display the scraped text
        st.subheader("📄 Scraped Text:")
        st.write(scraped_text)
        # Count words in the scraped text
        word_count = len(scraped_text.split())
        st.write(f"📝 Word Count: {word_count} words")
        # Preprocess and predict
        predict_with_threshold(scraped_text)
st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL to analyze")
image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)
# Display clickable links for fake news examples
st.title("πŸ”— Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")