import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
import os
from nltk.corpus import wordnet
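
# NLTK resources live in a project-local folder so the app can run without a
# system-wide nltk_data installation.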
nltk_data_path = 'nltk_data'
nltk.data.path.append(nltk_data_path)
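
# Download the required corpora/models only when they are missing locally.
# Newer NLTK releases may additionally expect 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'; add them here if a LookupError appears.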
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
    nltk.download('punkt', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
    nltk.download('stopwords', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
    nltk.download('wordnet', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'taggers/averaged_perceptron_tagger')):
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
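

# Map a Treebank POS tag (e.g. 'NN', 'VBD') to the WordNet constant the
# lemmatizer expects; anything unrecognised defaults to NOUN.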
def get_wordnet_pos(treebank_tag):
    tag = treebank_tag[0].upper()
    tag_dict = {'J': wordnet.ADJ,
                'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def remove_punctuation(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text
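

# English stop words, built once at startup and reused by the helpers below.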
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)


lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    word_tokens = word_tokenize(text)
    pos_tags = pos_tag(word_tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower(), get_wordnet_pos(tag))
                         for word, tag in pos_tags if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
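

# Full preprocessing pipeline: clean the raw text, then convert it into the
# padded integer sequence the LSTM expects. `tokenizer` and MAX_SEQUENCE_LENGTH
# are module-level globals defined further down, before this function is called.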
def preprocess_text(text):
    text = text.lower()
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
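

# Cache the Keras model with st.cache_resource so it is loaded once rather than
# on every Streamlit rerun.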
@st.cache_resource
def load_model():
    try:
        model = tf.keras.models.load_model('model5.h5')
        st.success("Model loaded successfully!")
        return model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None


model = load_model()
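

# Load the pickled Keras Tokenizer used to convert text into integer sequences.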
@st.cache_resource
def load_tokenizer():
    try:
        with open('tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        st.success("Tokenizer loaded successfully!")
        return tokenizer
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        return None


tokenizer = load_tokenizer()
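

# Padding length for model inputs and the probability threshold above which an
# article is flagged as fake.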
MAX_SEQUENCE_LENGTH = 200
THRESHOLD = 0.7


st.title("📰 US Political Fake News Text Detector using LSTM")
st.write("Details and progress here:")
st.markdown("[GitHub](https://github.com/yamerooo123/Political-Fake-News-Detector-NLP)")


st.title("🔍 Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")
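

# Fetch the article, drop <script> and <style> elements, and return the first
# 1,000 characters of visible text (or an error message string).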
def scrape_text_from_url(url):
    try:
        # A timeout keeps the app from hanging on unresponsive sites
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for script in soup(["script", "style"]):
            script.extract()

        text = soup.get_text(separator="\n").strip()
        return text[:1000]
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"
def predict_with_threshold(text):
    if model and tokenizer:
        preprocessed_text = preprocess_text(text)

        try:
            prediction = model.predict(preprocessed_text)
            raw_prediction = prediction[0][0]

            st.write(f"Raw model prediction: {raw_prediction}")

            fake_prob = raw_prediction * 100

            if raw_prediction > THRESHOLD:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Fake.")
            else:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Real.")
        except Exception as e:
            st.error(f"Error during prediction: {e}")
    else:
        st.error("Model or tokenizer not loaded. Cannot make predictions.")
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)
        if "Error" in scraped_text:
            st.error(scraped_text)
        else:
            st.subheader("📄 Scraped Text:")
            st.write(scraped_text)

            token_count = len(scraped_text.split())
            st.write(f"🔢 Word Count: {token_count} words")

            predict_with_threshold(scraped_text)

st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL above to analyze it.")

image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)

st.title("🔗 Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")
|