import gradio as gr
import pandas as pd
import io
import tempfile
import os
from langchain_community.document_loaders import PyPDFLoader
import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK's punkt tokenizer data if not already present
nltk.download('punkt_tab')

# Create a temporary directory for storing the downloadable CSV files
temp_dir = tempfile.TemporaryDirectory()


def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
    """
    try:
        # Initialize the loader; load() returns one Document per page,
        # unlike load_and_split(), which re-chunks pages with a text splitter
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load()

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name

        # Validate and adjust the page range
        if start_page is not None and end_page is not None:
            # Convert to integers to avoid slicing issues
            start_page = int(start_page)
            end_page = int(end_page)

            # Clamp to the valid range
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # Swap if out of order

            # Select the subset of documents based on user input
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        # Initialize lists to store data
        page_data = []
        sentence_data = []

        for idx, doc in enumerate(selected_docs, start=start_page):
            page_num = idx
            text = doc.page_content.strip()

            # Append page-wise data
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": text
            })

            # Sentence tokenization
            sentences = sent_tokenize(text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Create DataFrames
        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)

        return page_df, sentence_df
    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}")


def df_to_csv_bytes(df):
    """
    Convert a DataFrame to CSV data in bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data encoded as UTF-8.
    """
    try:
        buffer = io.StringIO()
        df.to_csv(buffer, index=False)
        csv_data = buffer.getvalue().encode('utf-8')
        buffer.close()
        return csv_data
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")


def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
    """
    Callback to extract text from the PDF and return paths to the CSV files.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        extraction_mode (str): "All Pages" or "Range of Pages".
        start_page (int): Starting page number for extraction.
        end_page (int): Ending page number for extraction.

    Returns:
        tuple:
            - page_csv_path (str): Path to the page-wise CSV file.
            - sentence_csv_path (str): Path to the sentence-wise CSV file.
            - status_message (str): Status of the extraction process.
    """
    if not pdf_file_path:
        return None, None, "No file uploaded."

    try:
        # Determine the page range based on extraction_mode
        if extraction_mode == "All Pages":
            selected_start = None
            selected_end = None
        else:
            selected_start = start_page
            selected_end = end_page

        # Extract text and create DataFrames
        page_df, sentence_df = extract_text_with_py_pdf_loader(
            pdf_file_path,
            start_page=selected_start,
            end_page=selected_end
        )

        # Convert DataFrames to CSV bytes
        page_csv_bytes = df_to_csv_bytes(page_df)
        sentence_csv_bytes = df_to_csv_bytes(sentence_df)

        # Define CSV filenames derived from the PDF's base name
        base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
        page_csv_filename = f"{base_name}_pages.csv"
        sentence_csv_filename = f"{base_name}_sentences.csv"

        # Define full paths within the temporary directory
        page_csv_path = os.path.join(temp_dir.name, page_csv_filename)
        sentence_csv_path = os.path.join(temp_dir.name, sentence_csv_filename)

        # Write CSV bytes to temporary files
        with open(page_csv_path, 'wb') as page_csv_file:
            page_csv_file.write(page_csv_bytes)
        with open(sentence_csv_path, 'wb') as sentence_csv_file:
            sentence_csv_file.write(sentence_csv_bytes)

        # Return the paths to the temporary CSV files and a success message
        return (
            page_csv_path,
            sentence_csv_path,
            "Extraction successful!"
        )
    except Exception as e:
        return None, None, f"Extraction failed: {e}"


with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Text Extractor with Multiple Exports")

    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
            type="filepath",  # Pass the upload to the callback as a file path
            interactive=True
        )

    with gr.Row():
        extraction_mode = gr.Radio(
            label="Extraction Mode",
            choices=["All Pages", "Range of Pages"],
            value="All Pages",
            interactive=True
        )

    with gr.Row():
        start_page = gr.Number(
            label="Start Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden
        )
        end_page = gr.Number(
            label="End Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden
        )

    # Toggle visibility of start_page and end_page based on extraction_mode
    extraction_mode.change(
        fn=lambda mode: (
            gr.update(visible=(mode == "Range of Pages")),
            gr.update(visible=(mode == "Range of Pages"))
        ),
        inputs=[extraction_mode],
        outputs=[start_page, end_page]
    )

    with gr.Row():
        extract_button = gr.Button("Extract and Download")

    with gr.Row():
        page_csv_download = gr.File(
            label="Download Page-wise CSV",
            interactive=False
        )
        sentence_csv_download = gr.File(
            label="Download Sentence-wise CSV",
            interactive=False
        )

    with gr.Row():
        status_output = gr.Textbox(
            label="Status",
            interactive=False,
            lines=2
        )

    extract_button.click(
        fn=on_extract,
        inputs=[pdf_input, extraction_mode, start_page, end_page],
        outputs=[page_csv_download, sentence_csv_download, status_output]
    )

    gr.Markdown("""
    ---
    Developed with ❤️ using Gradio and LangChain.
    """)

# Launch the Gradio app
demo.queue().launch()