import os
import re
import string
import pickle

import streamlit as st
import requests
from bs4 import BeautifulSoup
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure nltk_data is correctly set to the local folder
nltk_data_path = 'nltk_data'
nltk.data.path.append(nltk_data_path)

# Check if 'punkt' is already downloaded
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
    # Force download of punkt to the local 'nltk_data' directory
    nltk.download('punkt', download_dir=nltk_data_path)

# 'stopwords' and 'wordnet' are also required by the preprocessing below;
# nltk.download() skips packages that are already up to date.
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)

# Preprocessing helper functions

# 1. Remove punctuation
def remove_punctuation(text):
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text

# 2. Remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

# 3. Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    word_tokens = word_tokenize(text)  # This requires 'punkt'
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower())
                         for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)

# 4. Full text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Lemmatize
    text = lemmatize_text(text)
    # Tokenize and pad the sequence
    sequences = tokenizer.texts_to_sequences([text])
    padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return padded_sequences

# Function to download the model dynamically from Google Drive
# (assumes the file is shared publicly via a direct-download link)
def download_model(url, filename):
    if not os.path.exists(filename):  # Check if the file is already downloaded
        response = requests.get(url, stream=True)
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        print(f"{filename} downloaded successfully.")
    else:
        print(f"{filename} already exists, skipping download.")

# Download the model file (replace the URL with your Google Drive link)
model_url = "https://drive.google.com/uc?export=download&id=1oKcJzQnYgYjlzdmII09GNeFGWdIhXwuF"
model_filename = "model5.h5"
download_model(model_url, model_filename)

# Load the pre-trained model
@st.cache_resource
def load_model():
    return tf.keras.models.load_model(model_filename)

model = load_model()

# Load the saved tokenizer
@st.cache_resource
def load_tokenizer():
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

tokenizer = load_tokenizer()

# Set parameters for text preprocessing
MAX_SEQUENCE_LENGTH = 200  # Must match the sequence length the model was trained with
THRESHOLD = 0.7            # Adjust the decision threshold as needed

# Streamlit interface
st.title("📰 US Political Fake News Text Detector (LSTM)")
st.write("Detect whether a given piece of news is fake or real based on its content. "
         "Enter a URL to analyze its authenticity or test with a sample text.")

image = Image.open('list.png')
st.image(image, caption='Source: https://en.wikipedia.org/wiki/List_of_fake_news_websites',
         use_column_width=True)

# Display clickable links for example fake news articles
st.title("🔗 Example Fake News Articles")
st.markdown("[Link 1](https://newsexaminer.net/politics/democratic/trump-democrats-face-different-political-landscape-ahead-of-midterms/)")
st.markdown("[Link 2](https://newsexaminer.net/robert-f-kennedy-jr-suspends-2024-presidential-campaign-endorses-donald-trump/)")
st.markdown("[Link 3](https://newsexaminer.net/trumps-fiery-response-to-harris-dnc-speech-a-social-media-frenzy/)")

# URL input for web scraping
st.title("🔍 Analyze News from a URL")
url = st.text_input("Enter the URL of the news article you want to analyze:")

# Web scraping function to extract text from the URL
def scrape_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check that the request was successful
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove scripts and styles, then extract the text content
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text(separator="\n").strip()
        return text[:1000]  # Limit to the first 1000 characters
    except requests.exceptions.RequestException as e:
        return f"Error scraping the URL: {e}"

# Predict and display the result, showing the raw model output for verification
def predict_with_threshold(text):
    preprocessed_text = preprocess_text(text)
    prediction = model.predict(preprocessed_text)
    raw_prediction = prediction[0][0]  # Raw probability from the model

    # Show the raw prediction so the actual model output can be verified
    st.write(f"Raw model prediction: {raw_prediction}")

    fake_prob = raw_prediction * 100  # Scale to a percentage

    # Apply the threshold and report the result
    st.write(f"⚠️ Potential Fake News Probability: {fake_prob:.2f}%")
    if raw_prediction > THRESHOLD:
        st.write("The news article is likely Fake.")
    else:
        st.write("The news article is likely Real.")

# Analyze a news article from the given URL
if url:
    with st.spinner("Scraping the text..."):
        scraped_text = scrape_text_from_url(url)

    if "Error" in scraped_text:
        st.error(scraped_text)
    else:
        # Display the scraped text
        st.subheader("📄 Scraped Text:")
        st.write(scraped_text)

        # Count words in the scraped text
        token_count = len(scraped_text.split())
        st.write(f"📝 Word Count: {token_count} words")

        # Preprocess and predict
        predict_with_threshold(scraped_text)
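
# Usage note (a sketch, not part of the app logic): assuming this script is
# saved as app.py and that 'tokenizer.pickle' and 'list.png' sit alongside it
# (the file names come from the code above; the script name is an assumption),
# the app can be launched locally with:
#
#   pip install streamlit requests beautifulsoup4 pillow tensorflow nltk
#   streamlit run app.py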