import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
import os
from nltk.corpus import wordnet
# Set the NLTK data path
nltk_data_path = 'nltk_data'
nltk.data.path.append(nltk_data_path)
# Download 'punkt', 'stopwords', 'wordnet', and 'averaged_perceptron_tagger' if not already present
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
    nltk.download('punkt', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
    nltk.download('stopwords', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
    nltk.download('wordnet', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'taggers/averaged_perceptron_tagger')):
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
# Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    # The call site passes the Penn Treebank tag string (e.g. 'VBG'), so map its first letter
    tag = treebank_tag[0].upper()
    tag_dict = {'J': wordnet.ADJ,
                'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
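# Illustrative examples (assuming Penn Treebank tags from nltk.pos_tag):
#   get_wordnet_pos('VBG') -> wordnet.VERB
#   get_wordnet_pos('NNS') -> wordnet.NOUN
#   get_wordnet_pos('PRP') -> wordnet.NOUN  (unknown tags fall back to NOUN)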
# Preprocessing helper functions
# 1. Remove punctuation
def remove_punctuation(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text
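# Illustrative example: remove_punctuation("U.S. election, 2024!") -> "US election 2024"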
# 2. Remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)
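# Illustrative example: remove_stopwords("this is a breaking news story") -> "breaking news story"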
# 3. Lemmatization with POS tagging
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    pos_tags = pos_tag(word_tokens)  # POS tagging
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower(), get_wordnet_pos(tag))
                         for word, tag in pos_tags if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
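# Illustrative example (exact output depends on the NLTK tagger/lemmatizer data downloaded above):
#   lemmatize_text("The senators were debating new policies") -> roughly "senator debate new policy"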
# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize with POS tagging
    text = lemmatize_text(text)
    # Tokenize and pad the sequence
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
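# The returned array has shape (1, MAX_SEQUENCE_LENGTH): one padded sequence of token IDs
# from the fitted tokenizer, ready to be passed directly to model.predict().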
# Load pre-trained model
@st.cache_resource
def load_model():
    try:
        model = tf.keras.models.load_model('model5.h5')
        st.success("Model loaded successfully!")
        return model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None
model = load_model()
# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    try:
        with open('tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        st.success("Tokenizer loaded successfully!")
        return tokenizer
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        return None
tokenizer = load_tokenizer()
# Parameters for text preprocessing; these must match the values the model was trained with
MAX_SEQUENCE_LENGTH = 200
THRESHOLD = 0.7
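# Assumption: the model ends in a single sigmoid unit whose output is the probability of the
# "fake" class, so predictions above THRESHOLD are flagged as likely fake further below.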
# Streamlit interface
st.title("π° US Political Fake News Text Detector By using LSTM")
st.write("Detail and Progression here:")
st.markdown("[github](https://github.com/yamerooo123/Political-Fake-News-Detector-NLP)")
# URL input for web scraping
st.title("π Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")
# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)  # 10-second timeout so the app doesn't hang on slow pages
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles, then extract the text content
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to the first 1,000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"
# Preprocess the text, run the model, and report the result against the threshold
def predict_with_threshold(text):
    if model is not None and tokenizer is not None:
        preprocessed_text = preprocess_text(text)
        try:
            prediction = model.predict(preprocessed_text)
            raw_prediction = prediction[0][0]  # Raw probability from the model
            # Show the raw prediction so the model output can be verified
            st.write(f"Raw model prediction: {raw_prediction}")
            fake_prob = raw_prediction * 100  # Scale to a percentage
            # Apply the threshold and show the result based on the raw prediction
            if raw_prediction > THRESHOLD:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Fake.")
            else:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Real.")
        except Exception as e:
            st.error(f"Error during prediction: {e}")
    else:
        st.error("Model or tokenizer not loaded. Cannot make predictions.")
# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)
    if "Error" in scraped_text:
        st.error(scraped_text)
    else:
        # Display the scraped text
        st.subheader("📄 Scraped Text:")
        st.write(scraped_text)
        # Count words in the scraped text
        token_count = len(scraped_text.split())
        st.write(f"🔢 Word Count: {token_count} words")
        # Preprocess and predict
        predict_with_threshold(scraped_text)
st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL to analyze")
image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)
# Display clickable links for fake news examples
st.title("π Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")