import gradio as gr
import pandas as pd
import io
import tempfile
import os
from langchain_community.document_loaders import PyPDFLoader
import nltk
from nltk.tokenize import sent_tokenize

# Download NLTK's punkt tokenizer data if not already present
nltk.download('punkt_tab')

# Create a temporary directory for storing the downloadable CSV files
temp_dir = tempfile.TemporaryDirectory()


def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
    """
    try:
        # Initialize the loader; load() returns one Document per page,
        # unlike load_and_split(), which re-chunks pages with a text splitter
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load()

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name

        # Validate and adjust the page range
        if start_page is not None and end_page is not None:
            # Convert to integers to avoid slicing issues
            start_page = int(start_page)
            end_page = int(end_page)

            # Clamp to the valid range
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # Swap if out of order

            # Select the subset of documents based on user input
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        # Initialize lists to store data
        page_data = []
        sentence_data = []

        for idx, doc in enumerate(selected_docs, start=start_page):
            page_num = idx
            text = doc.page_content.strip()

            # Append page-wise data
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": text
            })

            # Sentence tokenization
            sentences = sent_tokenize(text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Create DataFrames
        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)

        return page_df, sentence_df
    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}")


def df_to_csv_bytes(df):
    """
    Convert a DataFrame to CSV data in bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data encoded as UTF-8.
    """
    try:
        buffer = io.StringIO()
        df.to_csv(buffer, index=False)
        csv_data = buffer.getvalue().encode('utf-8')
        buffer.close()
        return csv_data
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")


def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
    """
    Callback to extract text from the PDF and return paths to the CSV files.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        extraction_mode (str): "All Pages" or "Range of Pages".
        start_page (int): Starting page number for extraction.
        end_page (int): Ending page number for extraction.

    Returns:
        tuple:
            - page_csv_path (str): Path to the page-wise CSV file.
            - sentence_csv_path (str): Path to the sentence-wise CSV file.
            - status_message (str): Status of the extraction process.
    """
    if not pdf_file_path:
        return None, None, "No file uploaded."

    try:
        # Determine the page range based on extraction_mode
        if extraction_mode == "All Pages":
            selected_start = None
            selected_end = None
        else:
            selected_start = start_page
            selected_end = end_page

        # Extract text and create DataFrames
        page_df, sentence_df = extract_text_with_py_pdf_loader(
            pdf_file_path,
            start_page=selected_start,
            end_page=selected_end
        )

        # Convert DataFrames to CSV bytes
        page_csv_bytes = df_to_csv_bytes(page_df)
        sentence_csv_bytes = df_to_csv_bytes(sentence_df)

        # Define CSV filenames derived from the PDF's base name
        base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
        page_csv_filename = f"{base_name}_pages.csv"
        sentence_csv_filename = f"{base_name}_sentences.csv"

        # Define full paths within the temporary directory
        page_csv_path = os.path.join(temp_dir.name, page_csv_filename)
        sentence_csv_path = os.path.join(temp_dir.name, sentence_csv_filename)

        # Write CSV bytes to temporary files
        with open(page_csv_path, 'wb') as page_csv_file:
            page_csv_file.write(page_csv_bytes)
        with open(sentence_csv_path, 'wb') as sentence_csv_file:
            sentence_csv_file.write(sentence_csv_bytes)

        # Return the paths to the temporary CSV files and a success message
        return (
            page_csv_path,
            sentence_csv_path,
            "Extraction successful!"
        )
    except Exception as e:
        return None, None, f"Extraction failed: {e}"


with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Text Extractor with Multiple Exports")

    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
            type="filepath",  # Pass the upload to the callback as a file path
            interactive=True
        )

    with gr.Row():
        extraction_mode = gr.Radio(
            label="Extraction Mode",
            choices=["All Pages", "Range of Pages"],
            value="All Pages",
            interactive=True
        )

    with gr.Row():
        start_page = gr.Number(
            label="Start Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden
        )
        end_page = gr.Number(
            label="End Page",
            value=1,
            precision=0,
            interactive=True,
            visible=False  # Initially hidden
        )

    # Toggle visibility of start_page and end_page based on extraction_mode
    extraction_mode.change(
        fn=lambda mode: (
            gr.update(visible=(mode == "Range of Pages")),
            gr.update(visible=(mode == "Range of Pages"))
        ),
        inputs=[extraction_mode],
        outputs=[start_page, end_page]
    )

    with gr.Row():
        extract_button = gr.Button("Extract and Download")

    with gr.Row():
        page_csv_download = gr.File(
            label="Download Page-wise CSV",
            interactive=False
        )
        sentence_csv_download = gr.File(
            label="Download Sentence-wise CSV",
            interactive=False
        )

    with gr.Row():
        status_output = gr.Textbox(
            label="Status",
            interactive=False,
            lines=2
        )

    extract_button.click(
        fn=on_extract,
        inputs=[pdf_input, extraction_mode, start_page, end_page],
        outputs=[page_csv_download, sentence_csv_download, status_output]
    )

    gr.Markdown("""
    ---
    Developed with ❤️ using Gradio and LangChain.
    """)

# Launch the Gradio app
demo.queue().launch()