import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
import os
from nltk.corpus import wordnet

# Set the NLTK data path
nltk_data_path = 'nltk_data'
nltk.data.path.append(nltk_data_path)

# Download 'punkt', 'stopwords', 'wordnet', and 'averaged_perceptron_tagger' if not already present
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
    nltk.download('punkt', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
    nltk.download('stopwords', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
    nltk.download('wordnet', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'taggers/averaged_perceptron_tagger')):
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Function to convert an NLTK (word, tag) pair to a WordNet POS tag
def get_wordnet_pos(word_tag):
    tag = word_tag[1][0].upper()
    tag_dict = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Preprocessing helper functions

# 1. Remove punctuation
def remove_punctuation(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text

# 2. Remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

# 3. Lemmatization with POS tagging
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    pos_tags = pos_tag(word_tokens)  # POS tagging
    # Pass the full (word, tag) pair, since get_wordnet_pos expects a tuple
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower(), get_wordnet_pos((word, tag)))
                         for word, tag in pos_tags if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize with POS tagging
    text = lemmatize_text(text)
    # Tokenize and pad the sequence
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences

# Load the pre-trained model
@st.cache_resource
def load_model():
    try:
        model = tf.keras.models.load_model('model5.h5')
        st.success("Model loaded successfully!")
        return model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None

model = load_model()

# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    try:
        with open('tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        st.success("Tokenizer loaded successfully!")
        return tokenizer
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        return None

tokenizer = load_tokenizer()

# Set parameters for text preprocessing; ensure these match what the model was trained with
MAX_SEQUENCE_LENGTH = 200
THRESHOLD = 0.7

# Streamlit interface
st.title("📰 US Political Fake News Text Detector Using LSTM")
st.write("Details and progress here:")
st.markdown("[github](https://github.com/yamerooo123/Political-Fake-News-Detector-NLP)")

# URL input for web scraping
st.title("🔍 Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")

# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles, then extract the text content
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to the first 1000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"

# Print the raw model output to verify predictions
def predict_with_threshold(text):
    if model and tokenizer:
        preprocessed_text = preprocess_text(text)
        # Ensure the prediction is based on the right model output
        try:
            prediction = model.predict(preprocessed_text)
            raw_prediction = prediction[0][0]  # Get the raw probability from the model

            # Print the raw prediction to see the actual output
            st.write(f"Raw model prediction: {raw_prediction}")

            fake_prob = raw_prediction * 100  # Scale to a percentage

            # Apply the threshold and show the result based on the raw prediction
            if raw_prediction > THRESHOLD:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Fake.")
            else:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Real.")
        except Exception as e:
            st.error(f"Error during prediction: {e}")
    else:
        st.error("Model or tokenizer not loaded. Cannot make predictions.")

# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)

    if "Error" in scraped_text:
        st.error(scraped_text)
    else:
        # Display the scraped text
        st.subheader("📄 Scraped Text:")
        st.write(scraped_text)

        # Count tokens in the scraped text
        token_count = len(scraped_text.split())
        st.write(f"📝 Word Count: {token_count} words")

        # Preprocess and predict
        predict_with_threshold(scraped_text)

st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL to analyze.")
image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)

# Display clickable links for fake news examples
st.title("🔗 Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")
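
# Usage note (a sketch, assuming this script is saved as app.py and that model5.h5,
# tokenizer.pickle, and the nltk_data folder sit in the same working directory):
# the app can then be launched locally with
#   streamlit run app.py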