import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Point NLTK at the locally bundled corpora
nltk.data.path.append('/Users/kritsadakruapat/Desktop/Political-Fake-News-Detector-NLP/Fake_News_Detection/nltk_data')
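
# The preprocessing steps below rely on the punkt, stopwords, and wordnet
# corpora. If they are not already present under nltk_data, they can be fetched
# once with nltk.download('punkt'), nltk.download('stopwords'), and
# nltk.download('wordnet'); newer NLTK versions may also want 'punkt_tab'.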
# Preprocessing helper functions

# 1. Remove punctuation
def remove_punctuation(text):
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # strip ASCII punctuation
    text = re.sub(r"[^\w\s]", "", text)  # strip any remaining non-word symbols
    return text
# 2. Remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)
# 3. Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    word_tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize
    text = lemmatize_text(text)
    # Tokenize and pad the sequence (tokenizer and MAX_SEQUENCE_LENGTH are
    # module-level names defined below, before this function is first called)
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
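
# For example, preprocess_text("The senator denied the report on Tuesday.")
# yields an integer array of shape (1, MAX_SEQUENCE_LENGTH), which can be
# passed directly to model.predict().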
# Load pre-trained model
@st.cache_resource
def load_model():
    return tf.keras.models.load_model('model5.h5')

model = load_model()

# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

tokenizer = load_tokenizer()
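
# Note: model5.h5 and tokenizer.pickle are resolved relative to the working
# directory, so both files must sit alongside this script when the app runs.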
# Set parameters for text preprocessing
MAX_SEQUENCE_LENGTH = 200 # Ensure this matches what the model was trained with
THRESHOLD = 0.7  # Probability cutoff: predictions above this are labeled Fake
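
# Example: a raw model output of 0.82 is reported as 82.00% and labeled Fake;
# an output of 0.40 is reported as 40.00% and labeled Real.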
# Streamlit interface
st.title("πŸ“° US Political Fake News Text Detector By using LSTM")
st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL to analyze its authenticity or test with a sample text.")
image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)
# Display clickable links for fake news examples
st.title("πŸ”— Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")
st.markdown("[Link 3](https://newsexaminer.net/trumps-fiery-response-to-harris-dnc-speech-a-social-media-frenzy/)")
# URL input for web scraping
st.title("πŸ” Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")
# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for non-2xx responses
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles, then extract the visible text content
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to the first 1000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"
# Preprocess the text, run the model, and report the verdict against the threshold
def predict_with_threshold(text):
    preprocessed_text = preprocess_text(text)
    prediction = model.predict(preprocessed_text)
    raw_prediction = prediction[0][0]  # Raw probability that the article is fake
    # Show the raw model output so predictions can be verified
    st.write(f"Raw model prediction: {raw_prediction}")
    fake_prob = raw_prediction * 100  # Scale to a percentage
    # Apply the threshold to label the article
    if raw_prediction > THRESHOLD:
        st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
        st.write("The news article is likely Fake.")
    else:
        st.write(f"✅ Potential Fake News Probability: {fake_prob:.2f}%")
        st.write("The news article is likely Real.")
# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)

    if scraped_text.startswith("Error"):
        st.error(scraped_text)
    else:
        # Display the scraped text
        st.subheader("📄 Scraped Text:")
        st.write(scraped_text)

        # Count words in the scraped text
        token_count = len(scraped_text.split())
        st.write(f"📝 Word Count: {token_count} words")

        # Preprocess and predict
        predict_with_threshold(scraped_text)