Spaces:

laubonghaudoi
/

cantonese-srt

Running on Zero

File size: 1,696 Bytes

1d7163f

import io
import logging
import os
import tempfile
from typing import Iterator, Optional

from pysrt import SubRipFile, SubRipItem, SubRipTime
from pytubefix import YouTube

from transcriber import TranscribeResult

logger = logging.getLogger(__name__)


def download_youtube_audio(video_id: str) -> Optional[str]:
    """
    Download the audio track of a YouTube video as an MP3 file.

    The file is written to the system temp directory and named
    ``<video_id>.mp3``.

    Args:
        video_id (str): YouTube video ID.

    Returns:
        Optional[str]: Path to the downloaded audio file, or None when the
        video has no title, exposes no audio-only stream, or any error
        occurs while fetching or downloading it.
    """
    url = "https://www.youtube.com/watch?v={}".format(video_id)
    output_dir = tempfile.gettempdir()

    try:
        # The "MWEB" client is a workaround for:
        # https://github.com/JuanBindez/pytubefix/issues/242#issuecomment-2369067929
        vid = YouTube(url, "MWEB")

        if vid.title is None:
            return None

        audio_stream = vid.streams.get_audio_only()
        if audio_stream is None:
            # Report the real cause instead of letting it surface as an
            # AttributeError swallowed by the handler below.
            logger.warning("No audio-only stream found for video %s", video_id)
            return None

        audio_stream.download(
            mp3=True,
            filename=video_id,
            output_path=output_dir,
            skip_existing=True,
        )
        return os.path.join(output_dir, video_id + ".mp3")

    except Exception:
        # Deliberate best-effort boundary: callers receive None on failure,
        # but the full traceback goes to the module logger rather than a
        # bare print of the message.
        logger.exception("Failed to download audio for video %s", video_id)
        return None


def to_srt(results: Iterator["TranscribeResult"]) -> str:
    """
    Convert TranscribeResult objects into the text of an SRT file.

    Each result becomes one subtitle cue, built from its ``start_time`` and
    ``end_time`` (passed to ``SubRipTime`` as seconds) and its ``text``.
    Cues are numbered from 1 in iteration order.

    Args:
        results: Iterable of transcription results, in display order.

    Returns:
        str: The serialized SRT content, using "\\n" line endings.
    """
    srt = SubRipFile()

    # SubRip cue numbers conventionally start at 1, not 0.
    for index, result in enumerate(results, start=1):
        srt.append(
            SubRipItem(
                index=index,
                start=SubRipTime(seconds=result.start_time),
                end=SubRipTime(seconds=result.end_time),
                text=result.text,
            )
        )

    # Serialize in memory rather than through a fixed "<tmp>/output.srt"
    # path: a shared file name lets concurrent calls overwrite or delete
    # each other's output, and it leaks the file if an error occurs before
    # cleanup.
    buffer = io.StringIO()
    srt.write_into(buffer, eol="\n")
    return buffer.getvalue()