Spaces:

RJuro
/

pdf-digest

Running

App Files Files Community

RJuro committed 18 days ago

Commit

b339ee0

1 Parent(s): 8d2ddbc

Plain TTS Flow

Browse files

Files changed (6) hide show

.gitignore +2 -1
app.py +117 -2
prompts/plain_TTS_QA.prompt +25 -36
prompts/plain_TTS_draft.prompt +33 -17
prompts/plain_TTS_outline.prompt +45 -48
utils/__pycache__/review_flow.cpython-311.pyc +0 -0

.gitignore CHANGED Viewed

@@ -1,4 +1,5 @@
 .env
 dev/
 promp_tmp/
-.aiderignore

 .env
 dev/
 promp_tmp/
+.aiderignore
+google_credentials.json

app.py CHANGED Viewed

@@ -137,7 +137,7 @@ with st.expander("How It Works"):
     )
 # --- Mode Selection ---
-mode = st.sidebar.radio("Choose a mode:", options=["Explore One Publication", "Write a Literature Review"])
 if mode == "Explore One Publication":
     st.subheader("Single-Publication Analysis 📄")
@@ -419,4 +419,119 @@ elif mode == "Write a Literature Review":
                         mime="audio/mp3"
                     )
                 except Exception as e:
-                    st.error("Podcast generation failed: " + str(e))

     )
 # --- Mode Selection ---
+mode = st.sidebar.radio("Choose a mode:", options=["Explore One Publication", "Write a Literature Review", "Generate TTS Readout"])
 if mode == "Explore One Publication":
     st.subheader("Single-Publication Analysis 📄")
                         mime="audio/mp3"
                     )
                 except Exception as e:
+                    st.error("Podcast generation failed: " + str(e))
+elif mode == "Generate TTS Readout":
+    st.subheader("Generate Simple TTS Readout")
+    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
+    # Mapping with nicer descriptors: Name, gender, and country flag emoji
+    voice_options = {
+        "Heart (Female) 🇺🇸": "af_heart",
+        "Bella (Female) 🇺🇸": "af_bella",
+        "Michael (Male) 🇺🇸": "am_michael",
+        "Puck (Male) 🇺🇸": "am_puck",
+        "Emma (Female) 🇬🇧": "bf_emma",
+        "George (Male) 🇬🇧": "bm_george"
+    }
+    selected_voice = st.selectbox("Select Voice", options=list(voice_options.keys()))
+    voice_choice = voice_options[selected_voice]
+    # Flag to store intermediate outputs to disk (disabled by default; set to True to enable)
+    store_intermediates = False
+    if uploaded_pdf is not None:
+        if uploaded_pdf.size < 5000:
+            st.error("Input does not appear to be a valid academic paper.")
+            st.stop()
+        st.session_state["uploaded_pdf_tts"] = uploaded_pdf
+        st.success("PDF uploaded successfully. 👍")
+        # Load generation models
+        title_model_name, title_generation_config = get_generation_model("flash")
+        default_model_name, default_generation_config = get_generation_model("thinking")
+        progress_bar = st.progress(0)
+        async def process_tts_readout():
+            temp_dir = setup_temp_directories()
+            try:
+                pdf_basename = os.path.splitext(uploaded_pdf.name)[0]
+                st.session_state["pdf_basename_tts"] = pdf_basename
+                temp_pdf_path = os.path.join(temp_dir, "uploaded.pdf")
+                with open(temp_pdf_path, "wb") as f:
+                    f.write(uploaded_pdf.getbuffer())
+                progress_bar.progress(10)
+                # Upload PDF to Gemini and wait for processing
+                pdf_file = upload_to_gemini(temp_pdf_path, mime_type="application/pdf")
+                wait_for_files_active([pdf_file])
+                progress_bar.progress(20)
+                # Validate the academic paper via title/reference check
+                with st.spinner("Validating academic paper..."):
+                    title_ref = await generate_title_reference_and_classification(
+                        pdf_file, title_model_name, title_generation_config
+                    )
+                    if title_ref.error:
+                        st.error("Uploaded PDF is not a valid academic paper: " + title_ref.error)
+                        st.stop()
+                progress_bar.progress(30)
+                # Step 1: Generate TTS Outline
+                with st.spinner("Generating TTS Outline..."):
+                    plain_tts_outline_prompt = load_prompt("prompts/plain_TTS_outline.prompt")
+                    outline_output = await async_generate_text(
+                        plain_tts_outline_prompt, pdf_file,
+                        model_name=default_model_name,
+                        generation_config=default_generation_config
+                    )
+                progress_bar.progress(50)
+                if store_intermediates:
+                    save_intermediate_output(outline_output, pdf_basename, "tts_outline")
+                # Step 2: Generate TTS Draft using the outline
+                with st.spinner("Generating TTS Draft..."):
+                    plain_tts_draft_prompt = load_prompt("prompts/plain_TTS_draft.prompt")
+                    combined_draft_prompt = outline_output + "\n\n" + plain_tts_draft_prompt
+                    draft_output = await async_generate_text(
+                        combined_draft_prompt, pdf_file,
+                        model_name=default_model_name,
+                        generation_config=default_generation_config
+                    )
+                progress_bar.progress(70)
+                if store_intermediates:
+                    save_intermediate_output(draft_output, pdf_basename, "tts_draft")
+                # Step 3: Finalize the readout via Q&A
+                with st.spinner("Finalizing TTS Readout..."):
+                    plain_tts_qa_prompt = load_prompt("prompts/plain_TTS_QA.prompt")
+                    combined_qa_prompt = draft_output + "\n\n" + plain_tts_qa_prompt
+                    final_output = await async_generate_text(
+                        combined_qa_prompt, pdf_file,
+                        model_name=default_model_name,
+                        generation_config=default_generation_config
+                    )
+                progress_bar.progress(90)
+                if store_intermediates:
+                    save_intermediate_output(final_output, pdf_basename, "tts_final")
+                # Generate audio using the selected voice
+                with st.spinner("Generating audio..."):
+                    audio_mp3_data = generate_tts_audio(final_output, voice=voice_choice, speed=1.0)
+                progress_bar.progress(100)
+                st.audio(audio_mp3_data, format="audio/mp3")
+                st.download_button(
+                    label="Download TTS Audio",
+                    data=audio_mp3_data,
+                    file_name=f"{pdf_basename}_tts_audio.mp3",
+                    mime="audio/mp3"
+                )
+            except Exception as e:
+                st.error("Error during TTS readout generation: " + str(e))
+            finally:
+                cleanup_temp_files(temp_dir)
+        if st.button("Generate TTS Readout Audio"):
+            asyncio.run(process_tts_readout())

prompts/plain_TTS_QA.prompt CHANGED Viewed

@@ -1,46 +1,35 @@
-You are a quality assurance specialist for academic Text-to-Speech (TTS) content. Your task is to produce a final, production-ready TTS version that meets all quality standards while maintaining the integrity of the original content.
 Input:
 1. Original academic paper (complete text)
-2. TTS-adapted version of the paper
 Quality Assurance Process:
-1. Content Verification and Reorganization
-   - Remove all references sections at the end of the document
-   - Remove all table listings at the end of the document
-   - Consolidate any split sections (e.g., "Discussion (continued)") into single, continuous sections
-   - Maintain the logical flow of the original while ensuring a continuous reading experience
-   - Verify no critical arguments or findings were omitted
-2. TTS-Specific Optimization
-   - Replace all non-TTS-friendly characters:
-     • Convert all < to "less than" and > to "greater than"
-     • Convert special characters (β, α, μ, Δ, etc.) to spoken form
-     • Spell out equations in verbal form
-     • Remove all formatting characters (*, #, _, etc.)
-   - Convert all parenthetical citations to spoken format:
-     • Example: "(Smith et al., 2020)" → "as Smith and colleagues demonstrated in 2020"
-     • Use varied phrasing for natural flow
-   - Ensure acronyms are properly expanded at first use
-   - Adapt table/figure references for audio context
-3. Output Format Requirements
-   - Provide a single, continuous document without section splits
-   - Use clear section headings as ## [Section Title]
-   - DO NOT include:
-     • References section
-     • Tables section at the end
-     • Figures section at the end
-     • Acknowledgements section
-     • Author information
-   - Ensure transitions between merged sections flow naturally
 4. Final Deliverable
-   - You MUST provide the complete, corrected text as your output
-   - Do not just describe what needs to be fixed
-   - Do not summarize your changes
-   - The entire paper should be presented in its final, TTS-ready form
-   - The document should be fully ready for direct input into a TTS system
-Important: Your output should be ONLY the final, production-ready TTS version with all corrections applied. Do not include analysis, summaries of changes, or section-by-section QA reviews in your output. The text should flow as a continuous, complete document optimized for audio consumption.

+You are a quality assurance specialist for academic Text-to-Speech (TTS) content. Your task is to review and finalize a draft TTS adaptation, focusing especially on handling tables correctly and removing unnecessary elements.
 Input:
 1. Original academic paper (complete text)
+2. TTS-adapted draft version of the paper
 Quality Assurance Process:
+1. Table Content Handling
+   - Convert all tabular information into narrative paragraphs
+   - For comparative tables, create flowing text that highlights key comparisons
+   - Use phrases like "Comparing the studies..." or "The research shows several patterns across..."
+   - Ensure all valuable data from tables is preserved in spoken form
+2. Content Verification
+   - Remove the references section completely
+   - Remove any partial or complete table listings
+   - Ensure no section headings contain special characters or formatting
+   - Verify all temperatures are properly written out (e.g., "negative 80 degrees Celsius")
+   - Check that all acronyms are properly expanded at first use
+   - Confirm all parenthetical citations are converted to natural spoken format
+3. Audio-Friendly Formatting
+   - Replace ALL special characters with spoken equivalents
+   - Ensure section headings are in plain text format with no special characters
+   - Verify transitions between sections flow naturally
+   - Check that all content is presented in complete sentences
+   - Confirm no markup, formatting codes, or non-verbal elements remain
 4. Final Deliverable
+   - Provide the complete, corrected text as your output
+   - The document must contain ZERO special characters, hashtags, asterisks, or formatting codes
+   - Every element must be in a form that can be read aloud naturally
+   - The entire paper should be presented as continuous, flowing text
+Important: Your output should be ONLY the final, production-ready TTS version with all corrections applied. Do not include explanations of your changes or QA notes. The text should flow as a continuous, complete document that a TTS system can read without encountering any non-verbal elements or requiring human interpretation.

prompts/plain_TTS_draft.prompt CHANGED Viewed

@@ -1,47 +1,63 @@
 You are an expert in creating TTS-friendly versions of academic papers. You're now in PHASE 2 - EXECUTION, where you'll transform an academic paper into audio-optimized content following the conversion plan created in Phase 1.
 PHASE 2 - EXECUTION (CURRENT TASK):
-Using both the original academic paper and the JSON conversion plan, create a TTS-optimized version while preserving the original language wherever possible.
 Instructions for Phase 2:
 1. Input requirements:
    - The original academic paper text
-   - The JSON conversion plan from Phase 1
-2. For each section in the conversion_plan:
-   - Locate the section using the content_markers (start/end text)
-   - Apply only the necessary modifications specified in tts_conversion_instructions
    - Preserve original wording wherever it doesn't hinder audio comprehension
-3. Make minimal, targeted transformations limited to:
    - Converting parenthetical citations into spoken form
      • Example: "(Smith et al., 2020)" → "as Smith and colleagues demonstrated in 2020"
-   - Adapting table/figure references for audio context
-     • Only modify how tables/figures are referenced, not the surrounding analysis
    - Spelling out symbols, equations and non-standard characters
      • Example: "p<0.05" → "p less than 0.05"
    - Adding minimal transition words between sections only when necessary for audio flow
-4. DO NOT modify:
    - Technical terminology or field-specific vocabulary
-   - Sentence structure unless absolutely necessary for audio comprehension
    - The author's original arguments, assertions, or conclusions
    - Any content that already works well in spoken form
-5. Output format:
    - Create a single comprehensive document with clearly marked sections
    - Each section should follow this structure:
      ```
-     ## [Section Title]
-     [Minimally-modified content for TTS]
      ```
-6. Final output should:
-   - Remain as close to verbatim as possible
    - Only modify elements that specifically hinder TTS delivery
    - Maintain the exact same information, tone, and academic level
    - Be indistinguishable from the original in terms of content and meaning
-Note: The principle of minimal intervention should guide all transformations. Only modify text when necessary for audio clarity. The goal is a verbatim conversion with just enough adaptation to work in audio format.

 You are an expert in creating TTS-friendly versions of academic papers. You're now in PHASE 2 - EXECUTION, where you'll transform an academic paper into audio-optimized content following the conversion plan created in Phase 1.
 PHASE 2 - EXECUTION (CURRENT TASK):
+Using the original academic paper, create a TTS-optimized version while preserving the original language wherever possible.
 Instructions for Phase 2:
 1. Input requirements:
    - The original academic paper text
+   - The outline with a plan for the transformation.
+2. For each section in the paper:
+   - Apply only necessary modifications for audio clarity
    - Preserve original wording wherever it doesn't hinder audio comprehension
+   - Maintain the logical flow and academic integrity of the content
+3. Make targeted transformations limited to:
    - Converting parenthetical citations into spoken form
      • Example: "(Smith et al., 2020)" → "as Smith and colleagues demonstrated in 2020"
+     • Only include the first citation when multiple citations appear for the same point
    - Spelling out symbols, equations and non-standard characters
      • Example: "p<0.05" → "p less than 0.05"
    - Adding minimal transition words between sections only when necessary for audio flow
+4. Table transformation requirements (CRITICAL):
+   - ALL tables must be completely transformed into narrative paragraphs
+   - Do not preserve ANY tabular structure, column headings, or row formats
+   - For comparative tables showing multiple studies/methods:
+     • Begin with a transition phrase like "Comparing the key studies in this review..."
+     • Organize information by meaningful patterns (chronological, methodological similarities, or finding categories)
+     • Highlight comparative elements: "While Study A found X, Study B demonstrated Y"
+     • Ensure all critical data points are preserved in the narrative
+   - For data tables:
+     • Convert into descriptive paragraphs that present the patterns and relationships
+     • Use natural language to describe trends, comparisons, and outliers
+     • Maintain the analytical insights from the original table
+5. Special handling for other complex elements:
+   - For references section: Omit the final reference list entirely as it's not suitable for audio
+   - For figures/visuals: Briefly describe what they would show, then focus on the insights they provide
+6. DO NOT modify:
    - Technical terminology or field-specific vocabulary
    - The author's original arguments, assertions, or conclusions
    - Any content that already works well in spoken form
+7. Output format:
    - Create a single comprehensive document with clearly marked sections
    - Each section should follow this structure:
      ```
+     Section Title
+     [Audio-optimized content]
      ```
+   - Do not include any formatting characters such as #, *, _, or other markdown symbols
+8. Final output should:
+   - Remain as close to verbatim as possible in non-tabular content
    - Only modify elements that specifically hinder TTS delivery
    - Maintain the exact same information, tone, and academic level
    - Be indistinguishable from the original in terms of content and meaning
+   - Contain absolutely no special characters, formatting codes, or tabular structures
+Note: The principle of minimal intervention should guide all transformations. Your goal is to create an audio-friendly version that maintains the scholarly integrity of the original while enabling smooth TTS delivery. The entire paper should be presented in its final, TTS-ready form with no special characters that cannot be read aloud.

prompts/plain_TTS_outline.prompt CHANGED Viewed

@@ -4,63 +4,60 @@ PHASE 1 - ANALYSIS AND PLANNING (CURRENT TASK):
 Analyze the provided academic paper and create a structured JSON plan for its conversion to TTS format. This plan will serve as instructions for the actual conversion in Phase 2.
 Instructions for Phase 1:
 Identify and map the paper's structure:
-Create a logical outline of all major sections and subsections
-Note where figures, tables, equations, and citations appear
-Identify sections to exclude (references, acknowledgments, etc.)
 Output a JSON planning document with this structure:
 {
-"paper_metadata": {
-"title": "Title of the paper",
-"authors": ["Author 1 Name", "Author 2 Name", ...],
-"publication_details": "e.g., Journal Name, Year (if available)"
-},
-"conversion_plan": [
-{
-"section_id": "unique_identifier",
-"section_title": "Section Title",
-"section_type": "abstract|introduction|methodology|results|discussion|conclusion|etc",
-"content_markers": {
-"start": "First 5-7 words of section...",
-"end": "...last 5-7 words of section"
-},
-"tts_conversion_instructions": [
-"Specific instruction for handling this section",
-"Handle X citations in paragraph 2",
-"Narrate Table Y findings",
-"Simplify equation discussion in paragraph Z"
-],
-"special_elements": [
-{
-"element_type": "citation|table|figure|equation",
-"location": "Paragraph number or descriptive location",
-"handling_strategy": "How this element should be converted"
-}
-]
-}
-],
-"global_conversion_guidelines": [
-"General principle 1 for the entire document",
-"General principle 2 for the entire document"
-]
 }
 For each section, provide clear content_markers using the first and last few words to help locate the section boundaries.
 For tts_conversion_instructions, be specific about:
-How to handle citations (e.g., "(Smith et al., 2019)" → "as Smith and colleagues found in 2019")
-How to narrate tables/figures (focus on interpreting findings rather than describing visuals)
-How to simplify complex terminology or equations
-How to improve flow between paragraphs or concepts
 PHASE 2 - EXECUTION (FUTURE TASK):
 In the next step, these instructions will be used to transform the actual content into TTS-friendly text, following all the specific guidelines provided in Phase 1.

 Analyze the provided academic paper and create a structured JSON plan for its conversion to TTS format. This plan will serve as instructions for the actual conversion in Phase 2.
 Instructions for Phase 1:
 Identify and map the paper's structure:
+- Create a logical outline of all major sections and subsections
+- Note where figures, tables, equations, and citations appear
+- Identify sections to exclude (references, acknowledgments, etc.)
 Output a JSON planning document with this structure:
+```
 {
+  "paper_metadata": {
+    "title": "Title of the paper",
+    "authors": ["Author 1 Name", "Author 2 Name", ...],
+    "publication_details": "e.g., Journal Name, Year (if available)"
+  },
+  "conversion_plan": [
+    {
+      "section_id": "unique_identifier",
+      "section_title": "Section Title",
+      "section_type": "abstract|introduction|methodology|results|discussion|conclusion|etc",
+      "content_markers": {
+        "start": "First 5-7 words of section...",
+        "end": "...last 5-7 words of section"
+      },
+      "tts_conversion_instructions": [
+        "Specific instruction for handling this section",
+        "Handle X citations in paragraph 2",
+        "Narrate Table Y findings",
+        "Simplify equation discussion in paragraph Z"
+      ],
+      "special_elements": [
+        {
+          "element_type": "citation|table|figure|equation",
+          "location": "Paragraph number or descriptive location",
+          "handling_strategy": "How this element should be converted"
+        }
+      ]
+    }
+  ],
+  "global_conversion_guidelines": [
+    "General principle 1 for the entire document",
+    "General principle 2 for the entire document"
+  ]
 }
+```
 For each section, provide clear content_markers using the first and last few words to help locate the section boundaries.
 For tts_conversion_instructions, be specific about:
+- How to handle citations (e.g., "(Smith et al., 2019)" → "as Smith and colleagues found in 2019")
+- How to handle figures and tables:
+  * For figures: Include a brief 1-2 sentence description using pattern "The paper presents Figure X, which shows [key visual element]." Then focus on the insights: "This figure illustrates that..."
+  * For simple tables: Summarize in a short paragraph using pattern "The paper now presents Table Y - a comparison of [key elements]. The main findings show that..."
+  * For complex tables (e.g., regression results): Use pattern "Table Z presents regression results that demonstrate [1-2 key conclusions]" without reading individual values
+- For mathematical formulas: Note their presence without reading them verbatim: "The paper includes a mathematical expression for [concept]" then explain the conclusion or implication
+- How to improve flow between paragraphs or concepts
 PHASE 2 - EXECUTION (FUTURE TASK):
 In the next step, these instructions will be used to transform the actual content into TTS-friendly text, following all the specific guidelines provided in Phase 1.

utils/__pycache__/review_flow.cpython-311.pyc CHANGED Viewed

Binary files a/utils/__pycache__/review_flow.cpython-311.pyc and b/utils/__pycache__/review_flow.cpython-311.pyc differ