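# Streamlit app: US political fake news detection with a pre-trained LSTM model.
# The app loads a saved Keras model (model5.h5) and tokenizer (tokenizer.pickle),
# scrapes article text from a user-supplied URL, preprocesses it (punctuation and
# stopword removal, lemmatization, tokenization, padding), and reports a fake-news
# probability against a fixed threshold.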
import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download NLTK resources dynamically if not already present
nltk.download('stopwords')
nltk.download('punkt')      # Tokenizer models used by word_tokenize
nltk.download('punkt_tab')  # Required separately by newer NLTK releases
nltk.download('wordnet')    # Required by WordNetLemmatizer
# Preprocessing helper functions
# 1. Remove punctuation
def remove_punctuation(text):
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)  # Also drop any remaining non-word symbols
    return text
# 2. Remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)
# 3. Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize
    text = lemmatize_text(text)
    # Tokenize and pad the sequence
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
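# Note: preprocess_text relies on the global `tokenizer` and MAX_SEQUENCE_LENGTH defined
# below, so it should only be called after they are loaded. Illustrative call (hypothetical
# input, not part of the original app):
#     preprocess_text("Senator announces new bill")
# returns an array of shape (1, MAX_SEQUENCE_LENGTH) ready for model.predict().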
# Load pre-trained model
@st.cache_resource
def load_model():
    return tf.keras.models.load_model('model5.h5')
model = load_model()
# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer
tokenizer = load_tokenizer()
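# The pickled tokenizer is assumed to be the same Keras Tokenizer that was fit on the
# training corpus; a different word index would not match what model5.h5 expects.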
# Set parameters for text preprocessing
MAX_SEQUENCE_LENGTH = 200 # Ensure this matches what the model was trained with
THRESHOLD = 0.7 # Adjust threshold as needed
# Streamlit interface
st.title("📰 US Political Fake News Detector Using an LSTM")
st.write("Detect whether a piece of news is fake or real based on its content. Enter a URL to analyze an article, or try one of the sample links below.")
image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)
# Display clickable links for fake news examples
st.title("π Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")
st.markdown("[Link 3](https://newsexaminer.net/trumps-fiery-response-to-harris-dnc-speech-a-social-media-frenzy/)")
# URL input for web scraping
st.title("π Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")
# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles, then extract the text content
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to the first 1000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"
# Prediction helper: shows the raw model output so the thresholded verdict can be verified
def predict_with_threshold(text):
    preprocessed_text = preprocess_text(text)
    prediction = model.predict(preprocessed_text)
    raw_prediction = prediction[0][0]  # Raw sigmoid probability, treated as P(fake)
    # Print the raw prediction to see the actual output
    st.write(f"Raw model prediction: {raw_prediction}")
    fake_prob = raw_prediction * 100  # Scale to a percentage
    # Apply the threshold and show the result based on the raw prediction
    if raw_prediction > THRESHOLD:
        st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
        st.write("The news article is likely Fake.")
    else:
        st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
        st.write("The news article is likely Real.")
# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)
    if "Error" in scraped_text:
        st.error(scraped_text)
    else:
        # Display scraped text
        st.subheader("Scraped Text:")
        st.write(scraped_text)
        # Count tokens in the scraped text
        token_count = len(scraped_text.split())
        st.write(f"Word Count: {token_count} words")
        # Preprocess and predict
        predict_with_threshold(scraped_text)