import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
import os
from nltk.corpus import wordnet
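
# NLTK resources live in a project-local folder so the app can run without a
# system-wide nltk_data installation.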
nltk_data_path = 'nltk_data'
nltk.data.path.append(nltk_data_path)
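
# Download the required corpora/models only when they are missing locally.
# Newer NLTK releases may additionally expect 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'; add them here if a LookupError appears.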
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
    nltk.download('punkt', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
    nltk.download('stopwords', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
    nltk.download('wordnet', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'taggers/averaged_perceptron_tagger')):
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
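

# Map a Treebank POS tag (e.g. 'NN', 'VBD') to the WordNet constant the
# lemmatizer expects; anything unrecognised defaults to NOUN.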
def get_wordnet_pos(treebank_tag):
    tag = treebank_tag[0].upper()
    tag_dict = {'J': wordnet.ADJ,
                'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def remove_punctuation(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text
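

# English stop words, built once at startup and reused by the helpers below.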
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)


lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    word_tokens = word_tokenize(text)
    pos_tags = pos_tag(word_tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower(), get_wordnet_pos(tag))
                         for word, tag in pos_tags if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
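

# Full preprocessing pipeline: clean the raw text, then convert it into the
# padded integer sequence the LSTM expects. `tokenizer` and MAX_SEQUENCE_LENGTH
# are module-level globals defined further down, before this function is called.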
def preprocess_text(text):
    text = text.lower()
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
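

# Cache the Keras model with st.cache_resource so it is loaded once rather than
# on every Streamlit rerun.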
@st.cache_resource
def load_model():
    try:
        model = tf.keras.models.load_model('model5.h5')
        st.success("Model loaded successfully!")
        return model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None


model = load_model()
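

# Load the pickled Keras Tokenizer used to convert text into integer sequences.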
@st.cache_resource
def load_tokenizer():
    try:
        with open('tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        st.success("Tokenizer loaded successfully!")
        return tokenizer
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        return None


tokenizer = load_tokenizer()
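

# Padding length for model inputs and the probability threshold above which an
# article is flagged as fake.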
MAX_SEQUENCE_LENGTH = 200
THRESHOLD = 0.7


st.title("📰 US Political Fake News Text Detector using LSTM")
st.write("Details and progress here:")
st.markdown("[GitHub](https://github.com/yamerooo123/Political-Fake-News-Detector-NLP)")


st.title("🔍 Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")
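

# Fetch the article, drop <script> and <style> elements, and return the first
# 1,000 characters of visible text (or an error message string).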
def scrape_text_from_url(url):
    try:
        # A timeout keeps the app from hanging on unresponsive sites
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for script in soup(["script", "style"]):
            script.extract()

        text = soup.get_text(separator="\n").strip()
        return text[:1000]
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"
def predict_with_threshold(text):
    if model and tokenizer:
        preprocessed_text = preprocess_text(text)

        try:
            prediction = model.predict(preprocessed_text)
            raw_prediction = prediction[0][0]

            st.write(f"Raw model prediction: {raw_prediction}")

            fake_prob = raw_prediction * 100

            if raw_prediction > THRESHOLD:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Fake.")
            else:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Real.")
        except Exception as e:
            st.error(f"Error during prediction: {e}")
    else:
        st.error("Model or tokenizer not loaded. Cannot make predictions.")
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)
        if "Error" in scraped_text:
            st.error(scraped_text)
        else:
            st.subheader("📄 Scraped Text:")
            st.write(scraped_text)

            token_count = len(scraped_text.split())
            st.write(f"🔢 Word Count: {token_count} words")

            predict_with_threshold(scraped_text)

st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL above to analyze it.")

image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)

st.title("🔗 Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")
|