Spaces:

tomekstor9
/

openai-whisper-large-v3-turbo

Sleeping

File size: 3,184 Bytes

10fe697
9fad0db
9da5321
2ff7a4a
10fe697
10dbd97
647ae53
42295c8
10dbd97
eef635f
 
da62944
 
933bac2
 
da62944
 
 
 
933bac2
 
9fad0db
69610ab
 
da62944
 
6838e5c
da62944
 
 
 
 
 
b26b92f
 
42295c8
da62944
933bac2
b26b92f
 
 
69610ab
da62944
b26b92f
da62944
647ae53
da62944
6838e5c
b26b92f
6838e5c
647ae53
42295c8
b26b92f
647ae53
b26b92f
647ae53
 
 
 
 
 
 
 
 
 
b26b92f
 
647ae53
 
 
b26b92f
647ae53
 
 
 
b26b92f
647ae53
b26b92f
647ae53
 
 
42295c8
b26b92f
647ae53
 
 
 
 
9fad0db
647ae53

import logging
import os
import re
import tempfile

import gradio as gr
from pydub import AudioSegment
from transformers import pipeline

# Speech-to-text: the smaller Whisper checkpoint, used for transcription
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# Text translation: Polish -> English model, used for the translation step
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-pl-en",
)

# Reduce the audio quality and convert the file to WAV
def reduce_audio_quality(input_path):
    """Re-encode a media file as 16 kHz, mono, 16-bit WAV.

    Args:
        input_path: Path of the audio/video file to decode with pydub.

    Returns:
        Path of the newly written WAV file, created in the system temporary
        directory (the caller is responsible for deleting it), or ``None``
        when the file could not be decoded or written.
    """
    # A unique temp file (instead of a fixed name in the working directory)
    # keeps overlapping requests from overwriting each other's audio.
    fd, reduced_path = tempfile.mkstemp(prefix="reduced_audio_", suffix=".wav")
    os.close(fd)
    try:
        audio = AudioSegment.from_file(input_path)
        reduced_audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
        # export() hands back the open output file; close it explicitly
        # rather than leaving the handle to the garbage collector.
        reduced_audio.export(reduced_path, format="wav", bitrate="64k").close()
    except Exception:
        # Best-effort contract: callers expect None on failure, but the cause
        # is logged instead of being silently discarded.
        logging.getLogger(__name__).exception(
            "Could not convert %r to reduced-quality WAV", input_path
        )
        try:
            os.remove(reduced_path)
        except OSError:
            pass
        return None
    return reduced_path

# Split the audio into fixed-length segments (30 seconds by default)
def split_audio_to_segments(input_path, segment_length=30):
    """Cut an audio file into consecutive WAV segments.

    Args:
        input_path: Path of the audio file to split.
        segment_length: Length of each segment in seconds (the last segment
            may be shorter). Fractional values are accepted.

    Returns:
        List of paths of the segment files, in playback order. The files are
        created in the system temporary directory; the caller deletes them.

    Raises:
        ValueError: If ``segment_length`` is not a positive duration.
    """
    step_ms = int(segment_length * 1000)
    if step_ms <= 0:
        # A negative step would otherwise silently produce no segments.
        raise ValueError(f"segment_length must be positive, got {segment_length!r}")

    audio = AudioSegment.from_file(input_path)
    segments = []
    try:
        # len(audio) is the duration in milliseconds.
        for start_ms in range(0, len(audio), step_ms):
            # Unique file per segment so concurrent requests cannot collide
            # on names like "segment_0.wav".
            fd, segment_path = tempfile.mkstemp(
                prefix=f"segment_{start_ms // 1000}_", suffix=".wav"
            )
            os.close(fd)
            segments.append(segment_path)
            # export() returns the open output file; close it right away.
            audio[start_ms:start_ms + step_ms].export(segment_path, format="wav").close()
    except Exception:
        # Do not leave partial output behind when a later export fails.
        for leftover in segments:
            try:
                os.remove(leftover)
            except OSError:
                pass
        raise
    return segments

def _remove_quietly(path):
    """Delete ``path`` if it is set and still exists; ignore filesystem errors."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        pass


# Process the uploaded file and stream the transcription as it grows
def transcribe_audio_stream(file):
    """Transcribe an uploaded media file, yielding the text segment by segment.

    Args:
        file: The uploaded file as delivered by the Gradio file input: either
            a filesystem path or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Yields:
        The transcription accumulated so far after each segment, or a single
        user-facing message when there is no input or processing fails.
    """
    if file is None:
        yield "Nie przesłano pliku."
        return

    # Depending on the Gradio version/configuration the upload arrives as a
    # plain path or as a temp-file wrapper; accept both.
    input_path = file if isinstance(file, (str, os.PathLike)) else file.name

    reduced_audio = None
    segments = []
    try:
        reduced_audio = reduce_audio_quality(input_path)
        if not reduced_audio:
            yield "Nie udało się zmniejszyć rozmiaru pliku."
            return

        segments = split_audio_to_segments(reduced_audio, segment_length=30)
        texts = []

        for segment in segments:
            result = transcriber(segment)
            texts.append(result['text'])
            # Free disk space as soon as a segment has been consumed.
            _remove_quietly(segment)
            yield " ".join(texts).strip()  # Stream the partial transcription
    except Exception as e:
        yield f"Błąd: {e}"
    finally:
        # Runs on success, on error, and when the consumer abandons the
        # stream, so no temporary audio is left behind.
        for leftover in segments:
            _remove_quietly(leftover)
        _remove_quietly(reduced_audio)

def _chunk_text(text, max_chars=400):
    """Split ``text`` into pieces of at most ``max_chars``, preferring sentence ends."""
    chunks = []
    current = ""
    for sentence in re.split(r"(?<=[.!?])\s+", text.strip()):
        # A single over-long sentence is broken up word by word instead.
        pieces = [sentence] if len(sentence) <= max_chars else sentence.split()
        for piece in pieces:
            if current and len(current) + 1 + len(piece) > max_chars:
                chunks.append(current)
                current = piece
            else:
                current = f"{current} {piece}" if current else piece
    if current:
        chunks.append(current)
    return chunks


# Translate the (manually corrected) transcription
def translate_text(text):
    """Translate text with the module-level ``translator`` pipeline.

    Args:
        text: Text to translate. ``None`` or whitespace-only input yields an
            empty string without calling the model.

    Returns:
        The translated text, or a user-facing error message when the
        translation fails.
    """
    if not text or not text.strip():
        return ""
    try:
        max_chars = 400
        if len(text) <= max_chars:
            # Short input: one model call, exactly as typed.
            return translator(text)[0]['translation_text'].strip()
        # A full transcript can exceed what the model accepts in one call,
        # so translate it in sentence-aligned chunks and stitch the results.
        translated = [
            translator(chunk)[0]['translation_text'].strip()
            for chunk in _chunk_text(text, max_chars)
        ]
        return " ".join(translated).strip()
    except Exception as e:
        return f"Błąd podczas tłumaczenia: {e}"

# Gradio interface
with gr.Blocks() as app:
    # Page heading and short usage instructions
    gr.Markdown("## Whisper Small - Transkrypcja i Tłumaczenie")
    gr.Markdown("Prześlij plik audio/wideo, wygeneruj transkrypcję, popraw ją ręcznie i przetłumacz na angielski.")

    # Upload widget and the button that starts transcription share one row
    with gr.Row():
        file_input = gr.File(label="Prześlij plik audio lub wideo (MOV, MP4, WAV, MP3)")
        transcribe_button = gr.Button("Wykonaj transkrypcję")

    # Transcript box, then the translation trigger and its output box
    transcription_output = gr.Textbox(label="Transkrypcja tekstowa (edytowalna)", lines=10)
    translate_button = gr.Button("Przetłumacz na angielski")
    translation_output = gr.Textbox(label="Tłumaczenie na angielski", lines=10)

    # Stream the transcription into the transcript box
    transcribe_button.click(fn=transcribe_audio_stream, inputs=file_input, outputs=transcription_output)

    # Translate whatever is currently in the transcript box (after any edits)
    translate_button.click(fn=translate_text, inputs=transcription_output, outputs=translation_output)

app.launch()