File size: 4,478 Bytes
bef66de
 
4d2d3bd
f4fa6cb
bef66de
 
f4fa6cb
d01f68f
 
ab55ccc
bef66de
 
4d2d3bd
 
bef66de
 
 
 
eefa888
 
7a1d956
4d2d3bd
bef66de
 
 
 
0b6b418
bef66de
eefa888
bef66de
 
ab55ccc
d01f68f
 
 
 
2abc41d
5f43bee
d01f68f
 
 
ab55ccc
bef66de
 
d01f68f
5c91bae
4d2d3bd
5c91bae
5331478
 
 
f4fa6cb
3f63243
 
 
 
 
 
 
 
 
 
e8dc53b
bef66de
f4fa6cb
d4d32c6
bef66de
4d2d3bd
5c91bae
bef66de
f4fa6cb
bef66de
5331478
5c91bae
bef66de
 
0b6b418
 
5c91bae
0b6b418
bef66de
 
5c91bae
eefa888
2abc41d
ab55ccc
bef66de
eefa888
bef66de
e8dc53b
0d7f0d9
4d2d3bd
 
0b6b418
409084a
0b6b418
409084a
 
 
eefa888
d50db7b
7a1d956
 
409084a
 
bef66de
 
 
409084a
bef66de
d01f68f
 
eefa888
 
d01f68f
d50db7b
 
bef66de
 
a1694cd
d01f68f
bef66de
 
d01f68f
bef66de
 
eefa888
5c91bae
 
 
 
 
 
0b6b418
bef66de
2abc41d
bef66de
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import tempfile
import time
import warnings
from pathlib import Path

import gradio as gr
import librosa
import spaces
import torch
from loguru import logger
from transformers import pipeline

# Silence library warnings (librosa/transformers are noisy at import and inference time).
warnings.filterwarnings("ignore")

# True when running on Hugging Face Spaces (the platform sets SYSTEM=spaces).
# NOTE(review): is_hf is not referenced anywhere in this file — confirm it is
# needed, or remove it.
is_hf = os.getenv("SYSTEM") == "spaces"

# Generation settings shared by every model in the demo.
generate_kwargs = {
    "language": "Japanese",
    "do_sample": False,  # greedy decoding for reproducible output
    "num_beams": 1,
    "no_repeat_ngram_size": 5,  # suppress degenerate repetition loops
    "max_new_tokens": 64,  # long results are truncated (documented in initial_md)
}


# Display name -> Hugging Face model id for the three pipelines shown in the UI.
model_dict = {
    "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
    "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
    "anime-whisper": "litagin/anime-whisper",
}

# Eagerly build one ASR pipeline per model at import time so the first request
# is fast; uses GPU + fp16 when CUDA is available, otherwise CPU + fp32.
logger.info("Initializing pipelines...")
pipe_dict = {
    k: pipeline(
        "automatic-speech-recognition",
        model=v,
        device="cuda" if torch.cuda.is_available() else "cpu",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    for k, v in model_dict.items()
}
logger.success("Pipelines initialized!")


@spaces.GPU
def transcribe_common(audio: str, model: str) -> str:
    """Transcribe an audio file with one of the preloaded pipelines.

    Args:
        audio: Path to the uploaded audio file (Gradio ``type="filepath"``).
        model: Key into ``pipe_dict`` selecting which pipeline to run.

    Returns:
        The transcription text, or a human-readable error message when the
        input is missing or longer than the 15-second demo limit.
    """
    if not audio:
        return "No audio file"
    filename = Path(audio).name
    logger.info(f"Model: {model}")
    # BUG FIX: the original logged the literal text "(unknown)" and never used
    # `filename`; log the actual file name instead.
    logger.info(f"Audio: {filename}")
    # Read and resample audio to 16kHz mono, as expected by Whisper models.
    try:
        y, sr = librosa.load(audio, mono=True, sr=16000)
    except Exception as e:
        # Fallback: convert to wav via pydub/ffmpeg when librosa cannot read
        # the container, then load the converted file.
        logger.error(f"Error reading file: {e}")
        from pydub import AudioSegment

        # Rename to avoid shadowing the `audio` path parameter, and use a
        # unique temp file instead of a hard-coded "temp.wav" in the CWD
        # (which would collide between concurrent requests).
        segment = AudioSegment.from_file(audio)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = Path(tmp.name)
        try:
            segment.export(tmp_path, format="wav")
            y, sr = librosa.load(tmp_path, mono=True, sr=16000)
        finally:
            # Clean up even if librosa.load raises (the original leaked the file).
            tmp_path.unlink(missing_ok=True)
    # Enforce the demo's 15-second limit before spending GPU time.
    duration = librosa.get_duration(y=y, sr=sr)
    logger.info(f"Duration: {duration:.2f}s")
    if duration > 15:
        logger.error(f"Audio too long, limit is 15 seconds, got {duration:.2f}s")
        return f"Audio too long, limit is 15 seconds, got {duration:.2f}s"
    start_time = time.time()
    result = pipe_dict[model](y, generate_kwargs=generate_kwargs)["text"]
    end_time = time.time()
    logger.success(f"Finished in {end_time - start_time:.2f}s\n{result}")
    return result


def transcribe_others(audio) -> tuple[str, str]:
    """Transcribe the same audio with both comparison models.

    Returns the (whisper-large-v3-turbo, kotoba-whisper-v2.0) results, in the
    order expected by the two output textboxes.
    """
    v3_text = transcribe_common(audio, "whisper-large-v3-turbo")
    kotoba_text = transcribe_common(audio, "kotoba-whisper-v2.0")
    return v3_text, kotoba_text


def transcribe_anime_whisper(audio) -> str:
    """Transcribe the audio with the Anime-Whisper model only."""
    result = transcribe_common(audio, "anime-whisper")
    return result


# Markdown shown at the top of the demo page (rendered by gr.Markdown below).
# Runtime content — the Japanese text is user-facing and must not be altered.
initial_md = """
# Anime-Whisper Demo

[**Anime Whisper**](https://huggingface.co/litagin/anime-whisper): 5千時間以上のアニメ調セリフと台本でファインチューニングされた日本語音声認識モデルのデモです。句読点や感嘆符がリズムや感情に合わせて自然に付き、NSFW含む非言語発話もうまく台本調に書き起こされます。

- デモでは**音声は15秒まで**しか受け付けません
- 日本語のみ対応 (Japanese only)
- 比較のために [openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) と [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) も用意しています

pipeに渡しているkwargsは以下:
```python
generate_kwargs = {
    "language": "Japanese",
    "do_sample": False,
    "num_beams": 1,
    "no_repeat_ngram_size": 5,
    "max_new_tokens": 64,  # 結果が長いときは途中で打ち切られる
}
```
"""

# Build the Gradio UI: one shared audio input, an Anime-Whisper section, and a
# side-by-side comparison section for the two baseline models.
with gr.Blocks() as app:
    gr.Markdown(initial_md)
    # type="filepath" makes the handlers receive a path string (see transcribe_common).
    audio = gr.Audio(type="filepath")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Anime-Whisper")
            button_galgame = gr.Button("Transcribe with Anime-Whisper")
            output_galgame = gr.Textbox(label="Result")
    gr.Markdown("### Comparison")
    button_others = gr.Button("Transcribe with other models")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3-Turbo")
            output_v3 = gr.Textbox(label="Result")
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V2.0")
            output_kotoba_v2 = gr.Textbox(label="Result")

    # Wire each button to its transcription handler.
    button_galgame.click(
        transcribe_anime_whisper,
        inputs=[audio],
        outputs=[output_galgame],
    )
    # transcribe_others returns a 2-tuple matching the two output textboxes.
    button_others.click(
        transcribe_others,
        inputs=[audio],
        outputs=[output_v3, output_kotoba_v2],
    )

app.launch(inbrowser=True)