RJuro committed on
Commit
b339ee0
·
1 Parent(s): 8d2ddbc

Plain TTS Flow

Browse files
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  .env
2
  dev/
3
  promp_tmp/
4
- .aiderignore
 
 
1
  .env
2
  dev/
3
  promp_tmp/
4
+ .aiderignore
5
+ google_credentials.json
app.py CHANGED
@@ -137,7 +137,7 @@ with st.expander("How It Works"):
137
  )
138
 
139
  # --- Mode Selection ---
140
- mode = st.sidebar.radio("Choose a mode:", options=["Explore One Publication", "Write a Literature Review"])
141
 
142
  if mode == "Explore One Publication":
143
  st.subheader("Single-Publication Analysis 📄")
@@ -419,4 +419,119 @@ elif mode == "Write a Literature Review":
419
  mime="audio/mp3"
420
  )
421
  except Exception as e:
422
- st.error("Podcast generation failed: " + str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  )
138
 
139
  # --- Mode Selection ---
140
+ mode = st.sidebar.radio("Choose a mode:", options=["Explore One Publication", "Write a Literature Review", "Generate TTS Readout"])
141
 
142
  if mode == "Explore One Publication":
143
  st.subheader("Single-Publication Analysis 📄")
 
419
  mime="audio/mp3"
420
  )
421
  except Exception as e:
422
+ st.error("Podcast generation failed: " + str(e))
423
+
424
+ elif mode == "Generate TTS Readout":
425
+ st.subheader("Generate Simple TTS Readout")
426
+ uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
427
+
428
+ # Mapping with nicer descriptors: Name, gender, and country flag emoji
429
+ voice_options = {
430
+ "Heart (Female) 🇺🇸": "af_heart",
431
+ "Bella (Female) 🇺🇸": "af_bella",
432
+ "Michael (Male) 🇺🇸": "am_michael",
433
+ "Puck (Male) 🇺🇸": "am_puck",
434
+ "Emma (Female) 🇬🇧": "bf_emma",
435
+ "George (Male) 🇬🇧": "bm_george"
436
+ }
437
+ selected_voice = st.selectbox("Select Voice", options=list(voice_options.keys()))
438
+ voice_choice = voice_options[selected_voice]
439
+
440
+ # Flag to store intermediate outputs to disk (disabled by default; set to True to enable)
441
+ store_intermediates = False
442
+
443
+ if uploaded_pdf is not None:
444
+ if uploaded_pdf.size < 5000:
445
+ st.error("Input does not appear to be a valid academic paper.")
446
+ st.stop()
447
+ st.session_state["uploaded_pdf_tts"] = uploaded_pdf
448
+ st.success("PDF uploaded successfully. 👍")
449
+
450
+ # Load generation models
451
+ title_model_name, title_generation_config = get_generation_model("flash")
452
+ default_model_name, default_generation_config = get_generation_model("thinking")
453
+
454
+ progress_bar = st.progress(0)
455
+
456
+ async def process_tts_readout():
457
+ temp_dir = setup_temp_directories()
458
+ try:
459
+ pdf_basename = os.path.splitext(uploaded_pdf.name)[0]
460
+ st.session_state["pdf_basename_tts"] = pdf_basename
461
+ temp_pdf_path = os.path.join(temp_dir, "uploaded.pdf")
462
+ with open(temp_pdf_path, "wb") as f:
463
+ f.write(uploaded_pdf.getbuffer())
464
+ progress_bar.progress(10)
465
+
466
+ # Upload PDF to Gemini and wait for processing
467
+ pdf_file = upload_to_gemini(temp_pdf_path, mime_type="application/pdf")
468
+ wait_for_files_active([pdf_file])
469
+ progress_bar.progress(20)
470
+
471
+ # Validate the academic paper via title/reference check
472
+ with st.spinner("Validating academic paper..."):
473
+ title_ref = await generate_title_reference_and_classification(
474
+ pdf_file, title_model_name, title_generation_config
475
+ )
476
+ if title_ref.error:
477
+ st.error("Uploaded PDF is not a valid academic paper: " + title_ref.error)
478
+ st.stop()
479
+ progress_bar.progress(30)
480
+
481
+ # Step 1: Generate TTS Outline
482
+ with st.spinner("Generating TTS Outline..."):
483
+ plain_tts_outline_prompt = load_prompt("prompts/plain_TTS_outline.prompt")
484
+ outline_output = await async_generate_text(
485
+ plain_tts_outline_prompt, pdf_file,
486
+ model_name=default_model_name,
487
+ generation_config=default_generation_config
488
+ )
489
+ progress_bar.progress(50)
490
+ if store_intermediates:
491
+ save_intermediate_output(outline_output, pdf_basename, "tts_outline")
492
+
493
+ # Step 2: Generate TTS Draft using the outline
494
+ with st.spinner("Generating TTS Draft..."):
495
+ plain_tts_draft_prompt = load_prompt("prompts/plain_TTS_draft.prompt")
496
+ combined_draft_prompt = outline_output + "\n\n" + plain_tts_draft_prompt
497
+ draft_output = await async_generate_text(
498
+ combined_draft_prompt, pdf_file,
499
+ model_name=default_model_name,
500
+ generation_config=default_generation_config
501
+ )
502
+ progress_bar.progress(70)
503
+ if store_intermediates:
504
+ save_intermediate_output(draft_output, pdf_basename, "tts_draft")
505
+
506
+ # Step 3: Finalize the readout via Q&A
507
+ with st.spinner("Finalizing TTS Readout..."):
508
+ plain_tts_qa_prompt = load_prompt("prompts/plain_TTS_QA.prompt")
509
+ combined_qa_prompt = draft_output + "\n\n" + plain_tts_qa_prompt
510
+ final_output = await async_generate_text(
511
+ combined_qa_prompt, pdf_file,
512
+ model_name=default_model_name,
513
+ generation_config=default_generation_config
514
+ )
515
+ progress_bar.progress(90)
516
+ if store_intermediates:
517
+ save_intermediate_output(final_output, pdf_basename, "tts_final")
518
+
519
+ # Generate audio using the selected voice
520
+ with st.spinner("Generating audio..."):
521
+ audio_mp3_data = generate_tts_audio(final_output, voice=voice_choice, speed=1.0)
522
+ progress_bar.progress(100)
523
+
524
+ st.audio(audio_mp3_data, format="audio/mp3")
525
+ st.download_button(
526
+ label="Download TTS Audio",
527
+ data=audio_mp3_data,
528
+ file_name=f"{pdf_basename}_tts_audio.mp3",
529
+ mime="audio/mp3"
530
+ )
531
+ except Exception as e:
532
+ st.error("Error during TTS readout generation: " + str(e))
533
+ finally:
534
+ cleanup_temp_files(temp_dir)
535
+
536
+ if st.button("Generate TTS Readout Audio"):
537
+ asyncio.run(process_tts_readout())
prompts/plain_TTS_QA.prompt CHANGED
@@ -1,46 +1,35 @@
1
- You are a quality assurance specialist for academic Text-to-Speech (TTS) content. Your task is to produce a final, production-ready TTS version that meets all quality standards while maintaining the integrity of the original content.
2
 
3
  Input:
4
  1. Original academic paper (complete text)
5
- 2. TTS-adapted version of the paper
6
 
7
  Quality Assurance Process:
 
 
 
 
 
8
 
9
- 1. Content Verification and Reorganization
10
- - Remove all references sections at the end of the document
11
- - Remove all table listings at the end of the document
12
- - Consolidate any split sections (e.g., "Discussion (continued)") into single, continuous sections
13
- - Maintain the logical flow of the original while ensuring a continuous reading experience
14
- - Verify no critical arguments or findings were omitted
 
15
 
16
- 2. TTS-Specific Optimization
17
- - Replace all non-TTS-friendly characters:
18
- • Convert all < to "less than" and > to "greater than"
20
- • Convert special characters (β, α, μ, Δ, etc.) to spoken form
21
- • Spell out equations in verbal form
22
- • Remove all formatting characters (*, #, _, etc.)
23
- - Convert all parenthetical citations to spoken format:
24
- • Example: "(Smith et al., 2020)" → "as Smith and colleagues demonstrated in 2020"
25
- • Use varied phrasing for natural flow
25
- - Ensure acronyms are properly expanded at first use
26
- - Adapt table/figure references for audio context
27
-
28
- 3. Output Format Requirements
29
- - Provide a single, continuous document without section splits
30
- - Use clear section headings as ## [Section Title]
31
- - DO NOT include:
32
- • References section
33
- • Tables section at the end
34
- • Figures section at the end
35
- • Acknowledgements section
36
- • Author information
37
- - Ensure transitions between merged sections flow naturally
38
 
39
  4. Final Deliverable
40
- - You MUST provide the complete, corrected text as your output
41
- - Do not just describe what needs to be fixed
42
- - Do not summarize your changes
43
- - The entire paper should be presented in its final, TTS-ready form
44
- - The document should be fully ready for direct input into a TTS system
45
 
46
- Important: Your output should be ONLY the final, production-ready TTS version with all corrections applied. Do not include analysis, summaries of changes, or section-by-section QA reviews in your output. The text should flow as a continuous, complete document optimized for audio consumption.
 
1
+ You are a quality assurance specialist for academic Text-to-Speech (TTS) content. Your task is to review and finalize a draft TTS adaptation, focusing especially on handling tables correctly and removing unnecessary elements.
2
 
3
  Input:
4
  1. Original academic paper (complete text)
5
+ 2. TTS-adapted draft version of the paper
6
 
7
  Quality Assurance Process:
8
+ 1. Table Content Handling
9
+ - Convert all tabular information into narrative paragraphs
10
+ - For comparative tables, create flowing text that highlights key comparisons
11
+ - Use phrases like "Comparing the studies..." or "The research shows several patterns across..."
12
+ - Ensure all valuable data from tables is preserved in spoken form
13
 
14
+ 2. Content Verification
15
+ - Remove the references section completely
16
+ - Remove any partial or complete table listings
17
+ - Ensure no section headings contain special characters or formatting
18
+ - Verify all temperatures are properly written out (e.g., "negative 80 degrees Celsius")
19
+ - Check that all acronyms are properly expanded at first use
20
+ - Confirm all parenthetical citations are converted to natural spoken format
21
 
22
+ 3. Audio-Friendly Formatting
23
+ - Replace ALL special characters with spoken equivalents
24
+ - Ensure section headings are in plain text format with no special characters
25
+ - Verify transitions between sections flow naturally
26
+ - Check that all content is presented in complete sentences
27
+ - Confirm no markup, formatting codes, or non-verbal elements remain
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  4. Final Deliverable
30
+ - Provide the complete, corrected text as your output
31
+ - The document must contain ZERO special characters, hashtags, asterisks, or formatting codes
32
+ - Every element must be in a form that can be read aloud naturally
33
+ - The entire paper should be presented as continuous, flowing text
 
34
 
35
+ Important: Your output should be ONLY the final, production-ready TTS version with all corrections applied. Do not include explanations of your changes or QA notes. The text should flow as a continuous, complete document that a TTS system can read without encountering any non-verbal elements or requiring human interpretation.
prompts/plain_TTS_draft.prompt CHANGED
@@ -1,47 +1,63 @@
1
  You are an expert in creating TTS-friendly versions of academic papers. You're now in PHASE 2 - EXECUTION, where you'll transform an academic paper into audio-optimized content following the conversion plan created in Phase 1.
2
 
3
  PHASE 2 - EXECUTION (CURRENT TASK):
4
- Using both the original academic paper and the JSON conversion plan, create a TTS-optimized version while preserving the original language wherever possible.
5
 
6
  Instructions for Phase 2:
7
-
8
  1. Input requirements:
9
  - The original academic paper text
10
- - The JSON conversion plan from Phase 1
11
 
12
- 2. For each section in the conversion_plan:
13
- - Locate the section using the content_markers (start/end text)
14
- - Apply only the necessary modifications specified in tts_conversion_instructions
15
  - Preserve original wording wherever it doesn't hinder audio comprehension
 
16
 
17
- 3. Make minimal, targeted transformations limited to:
18
  - Converting parenthetical citations into spoken form
19
  • Example: "(Smith et al., 2020)" → "as Smith and colleagues demonstrated in 2020"
20
- - Adapting table/figure references for audio context
21
- • Only modify how tables/figures are referenced, not the surrounding analysis
22
  - Spelling out symbols, equations and non-standard characters
23
  • Example: "p<0.05" → "p less than 0.05"
24
  - Adding minimal transition words between sections only when necessary for audio flow
25
 
26
- 4. DO NOT modify:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  - Technical terminology or field-specific vocabulary
28
- - Sentence structure unless absolutely necessary for audio comprehension
29
  - The author's original arguments, assertions, or conclusions
30
  - Any content that already works well in spoken form
31
 
32
- 5. Output format:
33
  - Create a single comprehensive document with clearly marked sections
34
  - Each section should follow this structure:
35
  ```
36
- ## [Section Title]
37
 
38
- [Minimally-modified content for TTS]
39
  ```
 
40
 
41
- 6. Final output should:
42
- - Remain as close to verbatim as possible
43
  - Only modify elements that specifically hinder TTS delivery
44
  - Maintain the exact same information, tone, and academic level
45
  - Be indistinguishable from the original in terms of content and meaning
 
46
 
47
- Note: The principle of minimal intervention should guide all transformations. Only modify text when necessary for audio clarity. The goal is a verbatim conversion with just enough adaptation to work in audio format.
 
1
  You are an expert in creating TTS-friendly versions of academic papers. You're now in PHASE 2 - EXECUTION, where you'll transform an academic paper into audio-optimized content following the conversion plan created in Phase 1.
2
 
3
  PHASE 2 - EXECUTION (CURRENT TASK):
4
+ Using the original academic paper, create a TTS-optimized version while preserving the original language wherever possible.
5
 
6
  Instructions for Phase 2:
 
7
  1. Input requirements:
8
  - The original academic paper text
9
+ - The outline with a plan for the transformation.
10
 
11
+ 2. For each section in the paper:
12
+ - Apply only necessary modifications for audio clarity
 
13
  - Preserve original wording wherever it doesn't hinder audio comprehension
14
+ - Maintain the logical flow and academic integrity of the content
15
 
16
+ 3. Make targeted transformations limited to:
17
  - Converting parenthetical citations into spoken form
18
  • Example: "(Smith et al., 2020)" → "as Smith and colleagues demonstrated in 2020"
19
+ • Only include the first citation when multiple citations appear for the same point
 
20
  - Spelling out symbols, equations and non-standard characters
21
  • Example: "p<0.05" → "p less than 0.05"
22
  - Adding minimal transition words between sections only when necessary for audio flow
23
 
24
+ 4. Table transformation requirements (CRITICAL):
25
+ - ALL tables must be completely transformed into narrative paragraphs
26
+ - Do not preserve ANY tabular structure, column headings, or row formats
27
+ - For comparative tables showing multiple studies/methods:
28
+ • Begin with a transition phrase like "Comparing the key studies in this review..."
29
+ • Organize information by meaningful patterns (chronological, methodological similarities, or finding categories)
30
+ • Highlight comparative elements: "While Study A found X, Study B demonstrated Y"
31
+ • Ensure all critical data points are preserved in the narrative
32
+ - For data tables:
33
+ • Convert into descriptive paragraphs that present the patterns and relationships
34
+ • Use natural language to describe trends, comparisons, and outliers
35
+ • Maintain the analytical insights from the original table
36
+
37
+ 5. Special handling for other complex elements:
38
+ - For references section: Omit the final reference list entirely as it's not suitable for audio
39
+ - For figures/visuals: Briefly describe what they would show, then focus on the insights they provide
40
+
41
+ 6. DO NOT modify:
42
  - Technical terminology or field-specific vocabulary
 
43
  - The author's original arguments, assertions, or conclusions
44
  - Any content that already works well in spoken form
45
 
46
+ 7. Output format:
47
  - Create a single comprehensive document with clearly marked sections
48
  - Each section should follow this structure:
49
  ```
50
+ Section Title
51
 
52
+ [Audio-optimized content]
53
  ```
54
+ - Do not include any formatting characters such as #, *, _, or other markdown symbols
55
 
56
+ 8. Final output should:
57
+ - Remain as close to verbatim as possible in non-tabular content
58
  - Only modify elements that specifically hinder TTS delivery
59
  - Maintain the exact same information, tone, and academic level
60
  - Be indistinguishable from the original in terms of content and meaning
61
+ - Contain absolutely no special characters, formatting codes, or tabular structures
62
 
63
+ Note: The principle of minimal intervention should guide all transformations. Your goal is to create an audio-friendly version that maintains the scholarly integrity of the original while enabling smooth TTS delivery. The entire paper should be presented in its final, TTS-ready form with no special characters that cannot be read aloud.
prompts/plain_TTS_outline.prompt CHANGED
@@ -4,63 +4,60 @@ PHASE 1 - ANALYSIS AND PLANNING (CURRENT TASK):
4
  Analyze the provided academic paper and create a structured JSON plan for its conversion to TTS format. This plan will serve as instructions for the actual conversion in Phase 2.
5
 
6
  Instructions for Phase 1:
7
-
8
  Identify and map the paper's structure:
9
-
10
- Create a logical outline of all major sections and subsections
11
-
12
- Note where figures, tables, equations, and citations appear
13
-
14
- Identify sections to exclude (references, acknowledgments, etc.)
15
 
16
  Output a JSON planning document with this structure:
 
17
  {
18
- "paper_metadata": {
19
- "title": "Title of the paper",
20
- "authors": ["Author 1 Name", "Author 2 Name", ...],
21
- "publication_details": "e.g., Journal Name, Year (if available)"
22
- },
23
- "conversion_plan": [
24
- {
25
- "section_id": "unique_identifier",
26
- "section_title": "Section Title",
27
- "section_type": "abstract|introduction|methodology|results|discussion|conclusion|etc",
28
- "content_markers": {
29
- "start": "First 5-7 words of section...",
30
- "end": "...last 5-7 words of section"
31
- },
32
- "tts_conversion_instructions": [
33
- "Specific instruction for handling this section",
34
- "Handle X citations in paragraph 2",
35
- "Narrate Table Y findings",
36
- "Simplify equation discussion in paragraph Z"
37
- ],
38
- "special_elements": [
39
- {
40
- "element_type": "citation|table|figure|equation",
41
- "location": "Paragraph number or descriptive location",
42
- "handling_strategy": "How this element should be converted"
43
- }
44
- ]
45
- }
46
- ],
47
- "global_conversion_guidelines": [
48
- "General principle 1 for the entire document",
49
- "General principle 2 for the entire document"
50
- ]
51
  }
 
52
 
53
  For each section, provide clear content_markers using the first and last few words to help locate the section boundaries.
54
 
55
  For tts_conversion_instructions, be specific about:
56
-
57
- How to handle citations (e.g., "(Smith et al., 2019)" → "as Smith and colleagues found in 2019")
58
-
59
- How to narrate tables/figures (focus on interpreting findings rather than describing visuals)
60
-
61
- How to simplify complex terminology or equations
62
-
63
- How to improve flow between paragraphs or concepts
64
 
65
  PHASE 2 - EXECUTION (FUTURE TASK):
66
  In the next step, these instructions will be used to transform the actual content into TTS-friendly text, following all the specific guidelines provided in Phase 1.
 
4
  Analyze the provided academic paper and create a structured JSON plan for its conversion to TTS format. This plan will serve as instructions for the actual conversion in Phase 2.
5
 
6
  Instructions for Phase 1:
 
7
  Identify and map the paper's structure:
8
+ - Create a logical outline of all major sections and subsections
9
+ - Note where figures, tables, equations, and citations appear
10
+ - Identify sections to exclude (references, acknowledgments, etc.)
 
 
 
11
 
12
  Output a JSON planning document with this structure:
13
+ ```
14
  {
15
+ "paper_metadata": {
16
+ "title": "Title of the paper",
17
+ "authors": ["Author 1 Name", "Author 2 Name", ...],
18
+ "publication_details": "e.g., Journal Name, Year (if available)"
19
+ },
20
+ "conversion_plan": [
21
+ {
22
+ "section_id": "unique_identifier",
23
+ "section_title": "Section Title",
24
+ "section_type": "abstract|introduction|methodology|results|discussion|conclusion|etc",
25
+ "content_markers": {
26
+ "start": "First 5-7 words of section...",
27
+ "end": "...last 5-7 words of section"
28
+ },
29
+ "tts_conversion_instructions": [
30
+ "Specific instruction for handling this section",
31
+ "Handle X citations in paragraph 2",
32
+ "Narrate Table Y findings",
33
+ "Simplify equation discussion in paragraph Z"
34
+ ],
35
+ "special_elements": [
36
+ {
37
+ "element_type": "citation|table|figure|equation",
38
+ "location": "Paragraph number or descriptive location",
39
+ "handling_strategy": "How this element should be converted"
40
+ }
41
+ ]
42
+ }
43
+ ],
44
+ "global_conversion_guidelines": [
45
+ "General principle 1 for the entire document",
46
+ "General principle 2 for the entire document"
47
+ ]
48
  }
49
+ ```
50
 
51
  For each section, provide clear content_markers using the first and last few words to help locate the section boundaries.
52
 
53
  For tts_conversion_instructions, be specific about:
54
+ - How to handle citations (e.g., "(Smith et al., 2019)" → "as Smith and colleagues found in 2019")
55
+ - How to handle figures and tables:
56
+ * For figures: Include a brief 1-2 sentence description using pattern "The paper presents Figure X, which shows [key visual element]." Then focus on the insights: "This figure illustrates that..."
57
+ * For simple tables: Summarize in a short paragraph using pattern "The paper now presents Table Y - a comparison of [key elements]. The main findings show that..."
58
+ * For complex tables (e.g., regression results): Use pattern "Table Z presents regression results that demonstrate [1-2 key conclusions]" without reading individual values
59
+ - For mathematical formulas: Note their presence without reading them verbatim: "The paper includes a mathematical expression for [concept]" then explain the conclusion or implication
60
+ - How to improve flow between paragraphs or concepts
 
61
 
62
  PHASE 2 - EXECUTION (FUTURE TASK):
63
  In the next step, these instructions will be used to transform the actual content into TTS-friendly text, following all the specific guidelines provided in Phase 1.
utils/__pycache__/review_flow.cpython-311.pyc CHANGED
Binary files a/utils/__pycache__/review_flow.cpython-311.pyc and b/utils/__pycache__/review_flow.cpython-311.pyc differ