import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import string
import pickle
import os
from nltk.corpus import wordnet

# Set the NLTK data path
nltk_data_path = 'nltk_data'
nltk.data.path.append(nltk_data_path)

# Download 'punkt', 'stopwords', 'wordnet', and 'averaged_perceptron_tagger' if not already present
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
    nltk.download('punkt', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
    nltk.download('stopwords', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
    nltk.download('wordnet', download_dir=nltk_data_path)
if not os.path.exists(os.path.join(nltk_data_path, 'taggers/averaged_perceptron_tagger')):
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Function to convert an NLTK (Penn Treebank) POS tag to the WordNet POS constant
# expected by WordNetLemmatizer.lemmatize()
def get_wordnet_pos(tag):
    first_letter = tag[0].upper()
    tag_dict = {'J': wordnet.ADJ,
                'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'R': wordnet.ADV}
    return tag_dict.get(first_letter, wordnet.NOUN)
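
# Illustrative mapping (standard Penn Treebank tags from pos_tag): 'VBG' -> wordnet.VERB,
# 'JJ' -> wordnet.ADJ, 'RB' -> wordnet.ADV; anything unrecognised falls back to wordnet.NOUN.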

# Preprocessing helper functions

# 1. Remove punctuation
def remove_punctuation(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text
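
# Example (illustrative): remove_punctuation("breaking: tariffs rise, again!")
# returns "breaking tariffs rise again".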

# 2. Remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)
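
# Example (illustrative): remove_stopwords("the senate passed the bill") returns
# "senate passed bill"; "the" is in NLTK's English stopword list, content words are kept.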

# 3. Lemmatization with POS tagging
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    pos_tags = pos_tag(word_tokens)  # POS tagging
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower(), get_wordnet_pos(tag))
                         for word, tag in pos_tags if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)
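
# Rough example (actual output depends on the POS tagger): lemmatize_text("senators were
# debating new policies") returns something like "senator debate new policy".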

# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize with POS tagging
    text = lemmatize_text(text)
    # Tokenize and pad the sequence
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences
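
# The result is a padded integer matrix of shape (1, MAX_SEQUENCE_LENGTH), the input
# format model.predict() expects below. Words the tokenizer has never seen are dropped
# (or mapped to an OOV index, depending on how the tokenizer was fitted).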

# Load pre-trained model
@st.cache_resource
def load_model():
    try:
        model = tf.keras.models.load_model('model5.h5')
        st.success("Model loaded successfully!")
        return model
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None
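
# st.cache_resource keeps the loaded model in memory across Streamlit reruns, so the
# .h5 file is read once per server process rather than on every interaction.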

model = load_model()

# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    try:
        with open('tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        st.success("Tokenizer loaded successfully!")
        return tokenizer
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        return None

tokenizer = load_tokenizer()

# Text preprocessing parameters. These must match the values used when the model was trained.
MAX_SEQUENCE_LENGTH = 200
THRESHOLD = 0.7
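
# Decision rule used below (the model's sigmoid output is treated as the probability of
# the "fake" class): e.g. a raw prediction of 0.82 is shown as 82.00% and labelled
# "likely Fake", while 0.35 is shown as 35.00% and labelled "likely Real".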

# Streamlit interface
st.title("πŸ“° US Political Fake News Text Detector By using LSTM")
st.write("Detail and Progression here:")
st.markdown("[github](https://github.com/yamerooo123/Political-Fake-News-Detector-NLP)")

# URL input for web scraping
st.title("πŸ” Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")

# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url, timeout=10)  # timeout so the app does not hang on unresponsive sites
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove scripts, styles, and extract the text content
        for script in soup(["script", "style"]):
            script.extract()

        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to first 1000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"

# Print raw model output to verify predictions
def predict_with_threshold(text):
    if model and tokenizer:
        preprocessed_text = preprocess_text(text)

        # Ensure prediction is based on the right model output
        try:
            prediction = model.predict(preprocessed_text)
            raw_prediction = prediction[0][0]  # Get raw probability from the model

            # Print the raw prediction to see the actual output
            st.write(f"Raw model prediction: {raw_prediction}")

            fake_prob = raw_prediction * 100  # Scale to percentage

            # Apply threshold and show the result based on the raw prediction
            if raw_prediction > THRESHOLD:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Fake.")
            else:
                st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
                st.write("The news article is likely Real.")
        except Exception as e:
            st.error(f"Error during prediction: {e}")
    else:
        st.error("Model or tokenizer not loaded. Cannot make predictions.")

# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)
        if "Error" in scraped_text:
            st.error(scraped_text)
        else:
            # Display scraped text
            st.subheader("πŸ“„ Scraped Text:")
            st.write(scraped_text)

            # Count tokens in the scraped text
            token_count = len(scraped_text.split())
            st.write(f"πŸ“ Word Count: {token_count} words")

            # Preprocess and predict
            predict_with_threshold(scraped_text)

st.write("Detect whether a given piece of news is fake or real based on its content. Enter a URL to analyze")

image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites', use_column_width=True)

# Display clickable links for fake news examples
st.title("πŸ”— Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")