File size: 13,843 Bytes
14daa4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
#!/usr/bin/env bash

# CHARTER: This script is designed to process audio/video content, generate transcripts,
# summaries, and audio files. The following tasks are defined and must not be altered by any LLM:
# 1. Download content if a URL is provided
# 2. Generate or use existing VTT transcript
# 3. Clean the VTT transcript
# 4. Generate a summary of the transcript
# 5. Create a 16k VBR Opus OGG file for audio tracks (unless audio download is disabled)
# 6. Output both the cleaned VTT text and the summary
# 7. Exclude the WAV file from the results
# 8. Include the OGG file in the results only if both WAV and OGG were created
# This charter is unalterable and defines the core functionality of the script.

# --- Configuration (adjust these for the local machine) ---------------------

# whisper.cpp checkout: the `main` binary is run from its root and the models
# are looked up under ./models.
WHISPCC="${HOME}/work/whisper.cpp"
MODEL_PATH="${WHISPCC}/models/ggml-small.en-tdrz.bin"

# Directory that receives downloads and summaries, and the cache root.
OUTPUT_DIR="${HOME}/processed_audio"
CACHE_DIR='/tmp/summarize_cache'

# Ollama model used for summarization (alternative: "llama3.1:latest").
OLLAMA_MODEL='deepseek-coder-v2:16b'

# --- Prompts, one per transcript segment -------------------------------------
FIRST_PROMPT='Summarize this beginning part of a transcript in one sentence, then provide bullet points with timestamps (00:00:00 sentence).'
MIDDLE_PROMPT='Summarize the key points of this part of the transcript in bullet points with timestamps (00:00:00 sentence).'
LAST_PROMPT='Summarize the main takeaways of this final part of the transcript in bullet points with timestamps (00:00:00 sentence).'

# Pending jobs: each element is one command line, appended by add_job and
# executed later by process_job_queue.
declare -a JOB_QUEUE=()

# Create the output and cache directories up front (no-op if they exist).
mkdir -p "$OUTPUT_DIR" "$CACHE_DIR"

# Parse command line options
#   -f        summarize with fabric instead of ollama
#   -n        disable audio download / OGG creation
#   -a        enable audio again (the default; cancels an earlier -n)
#   -d VALUE  duration limit, passed to `ffmpeg -t` when a local file is
#             converted to WAV
USE_FABRIC=false
DISABLE_AUDIO=false
DURATION=""

# Parses the options in "$@" into USE_FABRIC / DISABLE_AUDIO / DURATION.
# OPTIND is deliberately left global so the caller can shift the parsed
# options away afterwards. Exits the script with status 1 on a bad option.
parse_options() {
    local opt
    # The leading ':' selects getopts' silent mode: only then does OPTARG hold
    # the offending option letter, and a missing argument is reported as ':'
    # instead of being folded into '?'.
    while getopts ":fnad:" opt; do
        case "$opt" in
            f)
                USE_FABRIC=true
                ;;
            n)
                DISABLE_AUDIO=true
                ;;
            a)
                DISABLE_AUDIO=false
                ;;
            d)
                DURATION="$OPTARG"
                ;;
            :)
                echo "Option -$OPTARG requires an argument." >&2
                exit 1
                ;;
            \?)
                echo "Invalid option: -$OPTARG" >&2
                exit 1
                ;;
        esac
    done
}

parse_options "$@"
shift $((OPTIND-1))

# Print the MD5 digest of the file named by $1, i.e. the first field of
# md5sum's output line. Prints nothing if md5sum cannot read the file.
get_md5() {
    local path="$1"
    md5sum "$path" | awk '{ print $1 }'
}

# Function to cache a file under $CACHE_DIR, keyed by its MD5 digest.
#
# The cache entry is a hard link to the input where possible. Hard links
# cannot cross filesystems, so when linking fails the file is copied to a
# temporary name inside the cache directory and renamed into place.
#
# Globals:   CACHE_DIR (read)
# Arguments: $1 - file to cache (must exist and be non-empty)
#            $2 - extension appended to the cache file name
# Outputs:   cache file path on stdout; diagnostics on stderr
# Returns:   0 on success, 1 on any failure
cache_file() {
    local INPUT_FILE="$1"
    local EXTENSION="$2"

    # Check if the input file exists and is not empty
    if [ ! -s "$INPUT_FILE" ]; then
        echo "Error: Input file is empty or does not exist." >&2
        return 1
    fi

    # Declared separately from the assignment so a failing digest is not
    # hidden behind `local`'s own exit status.
    local MD5
    MD5=$(get_md5 "$INPUT_FILE")
    if [ -z "$MD5" ]; then
        echo "Error: Failed to compute MD5 sum of input file." >&2
        return 1
    fi

    # Two directory levels taken from the digest keep directories small.
    local CACHE_SUBDIR="$CACHE_DIR/${MD5:0:2}/${MD5:2:2}"
    local SAFE_FILENAME
    SAFE_FILENAME=$(printf '%s\n' "$INPUT_FILE" | sed 's/[^a-zA-Z0-9._-]/_/g')
    local CACHE_FILE="$CACHE_SUBDIR/${MD5}_${SAFE_FILENAME}${EXTENSION}"

    echo "Cache operation: MD5 sum = $MD5" >&2
    echo "Cache file: $CACHE_FILE" >&2

    # Create cache subdirectory if it doesn't exist
    if ! mkdir -p "$CACHE_SUBDIR"; then
        echo "Error: Failed to create cache subdirectory." >&2
        return 1
    fi

    # Attempt to create the hardlink. stderr is silenced on purpose: a
    # cross-device failure is expected and handled by the copy below.
    if ln -f "$INPUT_FILE" "$CACHE_FILE" 2>/dev/null; then
        echo "Cache file created: $CACHE_FILE" >&2
        echo "$CACHE_FILE"
        return 0
    fi

    # Fallback: copy next to the target, then rename so the entry appears
    # in one step.
    local TMP_FILE=""
    if TMP_FILE=$(mktemp "$CACHE_SUBDIR/.cache_tmp.XXXXXX") \
        && cp -fp "$INPUT_FILE" "$TMP_FILE" \
        && mv -f "$TMP_FILE" "$CACHE_FILE"; then
        echo "Cache file created: $CACHE_FILE" >&2
        echo "$CACHE_FILE"
        return 0
    fi

    if [ -n "$TMP_FILE" ]; then
        rm -f "$TMP_FILE"
    fi
    echo "Error: Failed to create cache file." >&2
    return 1
}

# Function to sanitize a string for use as a filename.
#
# Transliterates to ASCII (dropping what cannot be converted), replaces every
# character outside [A-Za-z0-9._-] with '_', and lowercases the result.
#
# Arguments: $1 - arbitrary string (e.g. a video title)
# Outputs:   sanitized name on stdout
sanitize_filename() {
    local STRING="$1"
    # printf rather than echo: echo would treat a string that is exactly
    # "-n", "-e" or "-E" as an option and print nothing useful.
    printf '%s\n' "$STRING" \
        | iconv -c -t ascii//translit \
        | sed 's/[^A-Za-z0-9._-]/_/g' \
        | tr '[:upper:]' '[:lower:]'
}

# Filter for VTT text (stdin -> stdout): removes <...> tags, collapses runs of
# spaces into one, and trims leading/trailing blanks on every line.
clean_text() {
    sed -e 's/<[^>]*>//g' \
        -e 's/  */ /g' \
        -e 's/^[ \t]*//' \
        -e 's/[ \t]*$//'
}

# Summarize one transcript segment and print the result on stdout.
#   $1 - segment text
#   $2 - prompt passed to ollama (not used by the fabric backend)
# A segment shorter than 12 lines is not summarized: cache files whose names
# start with the MD5 of the text are removed and the text is echoed back.
# Returns 1 (with the backend output on stderr) when the backend fails.
summarize_segment() {
    local SEGMENT_TEXT="$1"
    local PROMPT="$2"
    local SUMMARY_OUTPUT=""
    local rc=0

    # Line count decides whether the segment is worth summarizing at all.
    local line_total
    line_total=$(echo "$SEGMENT_TEXT" | wc -l)

    if [ "$line_total" -lt 12 ]; then
        local digest
        digest=$(echo "$SEGMENT_TEXT" | md5sum | awk '{ print $1 }')
        rm -f "$CACHE_DIR/${digest:0:2}/${digest:2:2}/$digest"*
        printf '%s\n' "The input is too short for meaningful summarization. Cache entry removed. Here's the original text:"
        echo "$SEGMENT_TEXT"
        return 0
    fi

    # Backend output and diagnostics are captured together so they can be
    # reported if the run fails.
    if ! $USE_FABRIC; then
        # Use ollama for summarization
        SUMMARY_OUTPUT=$(ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT_TEXT" 2>&1)
        rc=$?
    else
        SUMMARY_OUTPUT=$(fabric -p summarize "$SEGMENT_TEXT" 2>&1)
        rc=$?
    fi

    if [ "$rc" -ne 0 ]; then
        echo "Error in summarization: $SUMMARY_OUTPUT" >&2
        return 1
    fi

    echo "$SUMMARY_OUTPUT"
}

# Append each argument to JOB_QUEUE as a separate job entry.
add_job() {
    local job
    for job in "$@"; do
        JOB_QUEUE+=("$job")
    done
}

# Report progress for one job as a single text line.
#   $1 - zero-based job index     $2 - total number of steps
#   $3 - current step             $4 - message to show
# JOB_COUNT is not a parameter; it is read from the calling scope.
update_job_progress() {
    local JOB_INDEX="$1" TOTAL_STEPS="$2" CURRENT_STEP="$3" JOB_MESSAGE="$4"

    # Placeholder for a real TUI progress bar (e.g. via 'whiptail' or
    # 'dialog'); for now a plain line is printed.
    local job_number=$((JOB_INDEX + 1))
    printf '%s\n' "Job ${job_number}/${JOB_COUNT}: ${JOB_MESSAGE} (${CURRENT_STEP}/${TOTAL_STEPS})"
}

# Function to process the job queue: runs every entry of JOB_QUEUE in order.
#
# Globals: JOB_QUEUE (read)
# Outputs: one header line on stdout, plus whatever the jobs print
# Returns: status of the last job executed (0 when the queue is empty)
process_job_queue() {
    local JOB_COUNT=${#JOB_QUEUE[@]}
    # Loop index kept local so it does not leak into (or clobber) a global `i`.
    local i
    echo "Processing job queue ($JOB_COUNT jobs)..."
    for (( i = 0; i < JOB_COUNT; i++ )); do
        # NOTE(security): entries are executed with eval, so they are parsed as
        # shell code. Whoever builds a queue entry must shell-quote any file
        # name or URL embedded in it.
        eval "${JOB_QUEUE[$i]}"
    done
}

# Summarize one transcript segment and store the result in a file.
#   $1 - segment text
#   $2 - prompt passed to ollama (not used by the fabric backend)
#   $3 - file that receives the summary
# A segment shorter than 12 lines is not summarized: cache files whose names
# start with the MD5 of the text are removed, a notice is printed on stdout and
# the raw text is written to the output file.
# Returns 1 (leaving the output file untouched) when the backend fails.
process_segment() {
    local SEGMENT_TEXT="$1"
    local PROMPT="$2"
    local OUTPUT_FILE="$3"
    local SUMMARY_OUTPUT=""
    local rc=0

    # Line count decides whether the segment is worth summarizing at all.
    local line_total
    line_total=$(echo "$SEGMENT_TEXT" | wc -l)

    if [ "$line_total" -lt 12 ]; then
        local digest
        digest=$(echo "$SEGMENT_TEXT" | md5sum | awk '{ print $1 }')
        rm -f "$CACHE_DIR/${digest:0:2}/${digest:2:2}/$digest"*
        printf '%s\n' "The input is too short for meaningful summarization. Cache entry removed. Here's the original text:"
        echo "$SEGMENT_TEXT" > "$OUTPUT_FILE"
        return 0
    fi

    # Backend output and diagnostics are captured together so they can be
    # reported if the run fails.
    if ! $USE_FABRIC; then
        # Use ollama for summarization
        SUMMARY_OUTPUT=$(ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT_TEXT" 2>&1)
        rc=$?
    else
        SUMMARY_OUTPUT=$(fabric -p summarize "$SEGMENT_TEXT" 2>&1)
        rc=$?
    fi

    if [ "$rc" -ne 0 ]; then
        echo "Error in summarization: $SUMMARY_OUTPUT" >&2
        return 1
    fi

    # Write the summary to the specified output file
    echo "$SUMMARY_OUTPUT" > "$OUTPUT_FILE"
}

# Helper for process_vtt: summarize one segment with the configured backend
# and write the result to stdout.
#   $1 - prompt (used by ollama only)   $2 - segment text
_process_vtt_summarize() {
    local PROMPT="$1"
    local SEGMENT="$2"
    if $USE_FABRIC; then
        fabric -p summarize "$SEGMENT"
    else
        ollama run "$OLLAMA_MODEL" "$PROMPT" "$SEGMENT"
    fi
}

# Function to process a VTT file: clean it with vttclean.py, split the cleaned
# transcript into three consecutive segments, summarize each one, and write
# and display the combined summary.
#
# Globals:   OUTPUT_DIR, USE_FABRIC, OLLAMA_MODEL, FIRST_PROMPT, MIDDLE_PROMPT,
#            LAST_PROMPT (read)
# Arguments: $1 - path to the VTT file
#            $2 - source URL or path; accepted for call compatibility,
#                 currently unused
# Outputs:   progress and the summary on stdout, errors on stderr; the
#            summary is stored as $OUTPUT_DIR/<vtt basename>_summary.txt
# Exits the script with status 1 if cleaning or summarizing fails.
process_vtt() {
    local VTT_FILE=$1
    local TEMP_DIR
    if ! TEMP_DIR=$(mktemp -d); then
        echo "Error: Failed to create temporary directory." >&2
        exit 1
    fi
    local BASE_NAME="${TEMP_DIR}/temp" # Temporary base name
    local CLEANED_TRANSCRIPT="${BASE_NAME}_cleaned.txt"
    local SUMMARY_FILE="${OUTPUT_DIR}/$(basename "$VTT_FILE" .vtt)_summary.txt"

    echo "Processing VTT file: $VTT_FILE"

    # Clean the VTT transcript
    if ! python3 "$(dirname "$0")/vttclean.py" "$VTT_FILE" > "$CLEANED_TRANSCRIPT" 2>"${CLEANED_TRANSCRIPT}.error"; then
        echo "Error: Failed to clean the VTT file. Error log:" >&2
        cat "${CLEANED_TRANSCRIPT}.error" >&2
        rm -rf "$TEMP_DIR"
        exit 1
    fi

    # Check if the cleaned transcript is empty
    if [ ! -s "$CLEANED_TRANSCRIPT" ]; then
        echo "Error: Cleaned transcript is empty." >&2
        rm -rf "$TEMP_DIR"
        exit 1
    fi

    # Generate summary
    echo "Summarizing transcript..."
    local TOTAL_LINES SEGMENT_SIZE
    local FIRST_SEGMENT MIDDLE_SEGMENT="" LAST_SEGMENT
    TOTAL_LINES=$(wc -l < "$CLEANED_TRANSCRIPT")
    SEGMENT_SIZE=$((TOTAL_LINES / 3))
    FIRST_SEGMENT=$(head -n "$SEGMENT_SIZE" "$CLEANED_TRANSCRIPT")
    # With fewer than three lines SEGMENT_SIZE is 0 and the range would be
    # "1,0", which sed still treats as "line 1"; leave the middle empty then.
    if [ "$SEGMENT_SIZE" -gt 0 ]; then
        MIDDLE_SEGMENT=$(sed -n "$((SEGMENT_SIZE + 1)),$((2 * SEGMENT_SIZE))p" "$CLEANED_TRANSCRIPT")
    fi
    # Everything after the middle segment, so the remainder of the integer
    # division (and a final line lacking a newline) is not dropped.
    LAST_SEGMENT=$(tail -n "+$((2 * SEGMENT_SIZE + 1))" "$CLEANED_TRANSCRIPT")

    # The segments are in memory now; the temporary files are no longer needed.
    rm -rf "$TEMP_DIR"

    # The "Generating summary ..." lines are written into the summary file as
    # section headers, so the file is never empty; backend failures are
    # therefore tracked through the exit status instead.
    local SUMMARY_STATUS=0
    {
        echo "Generating summary for first segment..."
        _process_vtt_summarize "$FIRST_PROMPT" "$FIRST_SEGMENT" || SUMMARY_STATUS=1

        echo "Generating summary for middle segment..."
        _process_vtt_summarize "$MIDDLE_PROMPT" "$MIDDLE_SEGMENT" || SUMMARY_STATUS=1

        echo "Generating summary for last segment..."
        _process_vtt_summarize "$LAST_PROMPT" "$LAST_SEGMENT" || SUMMARY_STATUS=1
    } > "$SUMMARY_FILE"

    if [ "$SUMMARY_STATUS" -ne 0 ] || [ ! -s "$SUMMARY_FILE" ]; then
        echo "Error: Summary generation failed." >&2
        exit 1
    fi

    echo "Summarization complete."

    # Display the content of the summary file
    echo "Summary content:"
    echo "----------------------------------------"
    cat "$SUMMARY_FILE"
    echo "----------------------------------------"
}

# Function to calculate the time difference between two timestamps.
#
# Arguments: $1 - first timestamp,  HH:MM:SS (a fractional part such as
#                 ".500" on the seconds is ignored)
#            $2 - second timestamp, same format
# Outputs:   $1 minus $2 in whole seconds on stdout (negative if $2 is later)
time_difference() {
    local TIME1="$1"  # Format: HH:MM:SS
    local TIME2="$2"  # Format: HH:MM:SS

    # Split on ':'; anything beyond the third field is discarded.
    local h1 m1 s1 h2 m2 s2 _
    IFS=: read -r h1 m1 s1 _ <<< "$TIME1"
    IFS=: read -r h2 m2 s2 _ <<< "$TIME2"

    # Drop fractional seconds (VTT timestamps look like 00:00:05.000).
    s1=${s1%%.*}
    s2=${s2%%.*}

    # "10#" forces base 10: without it, zero-padded fields such as "08" and
    # "09" are parsed as (invalid) octal numbers. Missing fields count as 0.
    local total1=$(( 10#${h1:-0} * 3600 + 10#${m1:-0} * 60 + 10#${s1:-0} ))
    local total2=$(( 10#${h2:-0} * 3600 + 10#${m2:-0} * 60 + 10#${s2:-0} ))

    # Return the difference (could be negative if TIME2 is later than TIME1)
    echo "$(( total1 - total2 ))"
}

# Main script logic
#
# Dispatches on the first positional argument:
#   *.vtt           - summarize an existing transcript
#   http(s)://...   - fetch subtitles (or, failing that, audio) with yt-dlp
#   existing file   - transcribe a local audio/video file with whisper.cpp
# Work is queued with add_job and executed by process_job_queue at the end.
#
# Queue entries are run through eval, so every file name or URL placed in one
# is %q-quoted; otherwise quotes, '$' or backticks in a name would be
# interpreted as shell code.
if [ $# -eq 0 ]; then
    echo "Error: No input provided. Please provide a valid URL, VTT file, or a local audio file." >&2
    exit 1
fi

if [[ "$1" == *.vtt ]]; then
    echo "Processing as VTT file..."
    add_job "process_vtt $(printf '%q %q' "$1" "$1")"
elif [[ "$1" =~ ^https?:// ]]; then
    # Anchored on the scheme so a local file whose name merely contains
    # "http" is not mistaken for a URL.
    echo "Processing as YouTube URL..."

    # Extract the video title
    if ! VIDEO_TITLE=$(yt-dlp --get-title "$1") || [ -z "$VIDEO_TITLE" ]; then
        echo "Error: Failed to retrieve the video title using yt-dlp." >&2
        exit 1
    fi
    FINAL_BASE_NAME=$(sanitize_filename "$VIDEO_TITLE")

    # Attempt to download subtitles first
    yt-dlp -N 3 --skip-download --write-auto-sub --sub-lang en \
           --cookies-from-browser brave --output "$OUTPUT_DIR/${FINAL_BASE_NAME}.%(ext)s" "$1"

    # yt-dlp puts the language code in front of the extension
    # (NAME.en.vtt), so look for that form as well as plain NAME.vtt.
    VTT_FILE=$(find "$OUTPUT_DIR" \( -name "${FINAL_BASE_NAME}.vtt" -o -name "${FINAL_BASE_NAME}.*.vtt" \) | head -n 1)

    if [ -n "$VTT_FILE" ]; then
        echo "Subtitles found, processing VTT file..."
        add_job "process_vtt $(printf '%q %q' "$VTT_FILE" "$1")"
    elif [ "$DISABLE_AUDIO" = false ]; then
        echo "No subtitles found, downloading audio and generating transcript..."
        if ! yt-dlp -N 3 -x --audio-format wav --postprocessor-args "-ar 16k" \
               --cookies-from-browser brave --output "$OUTPUT_DIR/${FINAL_BASE_NAME}.%(ext)s" "$1"; then
            echo "Error: Failed to download audio using yt-dlp. Check the URL and your internet connection." >&2
            exit 1
        fi

        WAV_FILE=$(find "$OUTPUT_DIR" -name "${FINAL_BASE_NAME}.wav" | head -n 1)

        if [ -z "$WAV_FILE" ]; then
            echo "Error: WAV file not found after download. Check yt-dlp output." >&2
            exit 1
        fi

        echo "Running Whisper-CPP to generate VTT transcript..."
        if ! "$WHISPCC"/main -ovtt -tdrz -m "$MODEL_PATH" "$WAV_FILE"; then
            echo "Error: Whisper-CPP transcription failed. Check the model path and audio file." >&2
            exit 1
        fi

        # whisper.cpp appends ".vtt" to the full input name (NAME.wav.vtt);
        # rename it to NAME.vtt, as the local-file branch does.
        VTT_FILE="${WAV_FILE%.wav}.vtt"
        if [ -f "${WAV_FILE}.vtt" ]; then
            mv -f "${WAV_FILE}.vtt" "$VTT_FILE"
        fi
        if [ ! -f "$VTT_FILE" ]; then
            echo "Error: Transcript not found after Whisper-CPP run." >&2
            exit 1
        fi

        add_job "process_vtt $(printf '%q %q' "$VTT_FILE" "$1")"

        # Convert WAV to OGG Opus
        echo "Converting WAV to OGG Opus..."
        OGG_FILE="${WAV_FILE%.wav}.ogg"
        if ! ffmpeg -i "$WAV_FILE" -c:a libopus -b:a 16k -vbr on -compression_level 10 -y "$OGG_FILE"; then
            echo "Error: Failed to convert to OGG format." >&2
            exit 1
        fi
        echo " - Audio: $OGG_FILE"
        # Remove the WAV file
        rm "$WAV_FILE"
    else
        # Nothing can be produced: no subtitles, and -n forbids fetching audio.
        echo "Error: No subtitles found and audio download is disabled (-n)." >&2
        exit 1
    fi
elif [ -f "$1" ]; then
    echo "Processing as local audio file..."
    INPUT_FILE="$1"
    WAV_FILE="${INPUT_FILE%.*}.wav"
    # Tracks whether the WAV is an intermediate created here (safe to delete)
    # or the user's own input file (must be kept).
    CREATED_WAV=false

    # Convert to WAV first if not already WAV
    if [[ "$INPUT_FILE" != *.wav ]]; then
        echo "Converting input to WAV format..."
        if ! ffmpeg -i "$INPUT_FILE" -ar 16000 -ac 1 -c:a pcm_s16le ${DURATION:+-t "$DURATION"} -y "$WAV_FILE"; then
            echo "Error: Failed to convert input to WAV format." >&2
            exit 1
        fi
        CREATED_WAV=true
    else
        WAV_FILE="$INPUT_FILE"
    fi

    echo "Running Whisper-CPP to generate VTT transcript..."
    if ! "$WHISPCC"/main -ovtt -tdrz -m "$MODEL_PATH" "$WAV_FILE" ; then
        echo "Error: Whisper-CPP transcription failed." >&2
        exit 1
    fi

    # whisper.cpp appends ".vtt" to the full input name (NAME.wav.vtt).
    VTT_FILE="${WAV_FILE%.wav}.vtt"
    if [ -f "${WAV_FILE}.vtt" ]; then
        mv -f "${WAV_FILE}.vtt" "$VTT_FILE"
    fi
    if [ ! -f "$VTT_FILE" ]; then
        echo "Error: Transcript not found after Whisper-CPP run." >&2
        exit 1
    fi
    add_job "process_vtt $(printf '%q %q' "$VTT_FILE" "$1")"

    if [ "$DISABLE_AUDIO" = false ]; then
        OGG_FILE="${WAV_FILE%.*}.ogg"
        if [ "$OGG_FILE" = "$INPUT_FILE" ]; then
            # The input already is NAME.ogg; converting would overwrite it.
            echo "Input is already an OGG file, skipping OGG conversion."
        else
            # Convert to OGG Opus
            echo "Converting to OGG Opus..."
            if ! ffmpeg -i "$WAV_FILE" -c:a libopus -b:a 16k -vbr on -compression_level 10 -y "$OGG_FILE"; then
                echo "Error: Failed to convert to OGG format." >&2
                exit 1
            fi
            echo " - Audio: $OGG_FILE"
        fi
    fi

    # Remove the WAV file per CHARTER point 7 - but only the intermediate one
    # created above, never a WAV that the user passed in as input.
    if [ "$CREATED_WAV" = true ]; then
        rm -f "$WAV_FILE"
    fi
else
    echo "Error: Invalid input. Provide a valid URL, VTT file, or a local audio file." >&2
    exit 1
fi

process_job_queue