import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources dynamically if not already present
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)      # Tokenizer models used by word_tokenize
nltk.download('punkt_tab', quiet=True)  # Required by word_tokenize on newer NLTK releases
nltk.download('wordnet', quiet=True)    # Required by WordNetLemmatizer

# Preprocessing helper functions

# 1. Remove punctuation
def remove_punctuation(text):
    # Escape punctuation so regex metacharacters are treated literally inside the character class
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Drop any remaining non-word, non-space characters (e.g. curly quotes)
    text = re.sub(r"[^\w\s]", "", text)
    return text

# 2. Remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

# 3. Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
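
# Note: without POS tags, WordNetLemmatizer treats every word as a noun by default,
# so plurals are normalized ("articles" -> "article") but verb forms such as
# "running" are left unchanged.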

# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize
    text = lemmatize_text(text)
    # Tokenize and pad the sequence
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
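
# Illustrative example (uses the tokenizer and MAX_SEQUENCE_LENGTH defined further below;
# Python resolves these names when the function is called, after they are loaded):
#   preprocess_text("The President signed the bill!")
#   -> "president signed bill" after cleaning, then an integer array of shape
#      (1, MAX_SEQUENCE_LENGTH) after tokenization and padding.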

# Load pre-trained model
@st.cache_resource
def load_model():
    return tf.keras.models.load_model('model5.h5')

model = load_model()

# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

tokenizer = load_tokenizer()
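
# Note: this must be the same Tokenizer that was fitted on the training corpus;
# a different word-index mapping would make the padded sequences meaningless to the model.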

# Set parameters for text preprocessing
MAX_SEQUENCE_LENGTH = 200  # Ensure this matches what the model was trained with
THRESHOLD = 0.7  # Adjust threshold as needed

# Streamlit interface
st.title("πŸ“° US Political Fake News Text Detector By using LSTM")
st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL to analyze its authenticity or test with a sample text.")


image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)

# Display clickable links for fake news examples
st.title("πŸ”— Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")
st.markdown("[Link 3](https://newsexaminer.net/trumps-fiery-response-to-harris-dnc-speech-a-social-media-frenzy/)")

# URL input for web scraping
st.title("πŸ” Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")

# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)  # Time out rather than hang on unresponsive sites
        response.raise_for_status()  # Raise an exception for HTTP error status codes
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove scripts, styles, and extract the text content
        for script in soup(["script", "style"]):
            script.extract()

        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to first 1000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"

# Preprocess the text, run the model, and report the prediction against THRESHOLD
def predict_with_threshold(text):
    preprocessed_text = preprocess_text(text)

    # Run the model; this assumes a single sigmoid output where higher values mean "fake"
    prediction = model.predict(preprocessed_text)
    raw_prediction = prediction[0][0]  # Raw fake-news probability from the model

    # Print the raw prediction to see the actual output
    st.write(f"Raw model prediction: {raw_prediction}")

    fake_prob = raw_prediction * 100  # Scale to percentage

    # Apply threshold and show the result based on the raw prediction
    if raw_prediction > THRESHOLD:
        st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
        st.write("The news article is likely Fake.")
    else:
        st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
        st.write("The news article is likely Real.")

# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)
        if "Error" in scraped_text:
            st.error(scraped_text)
        else:
            # Display scraped text
            st.subheader("πŸ“„ Scraped Text:")
            st.write(scraped_text)

            # Count words in the scraped text
            token_count = len(scraped_text.split())
            st.write(f"📝 Word Count: {token_count} words")

            # Preprocess and predict
            predict_with_threshold(scraped_text)