File size: 3,177 Bytes
dcb12bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gradio as gr
from moviepy.editor import VideoFileClip
import cv2
import base64
from openai import OpenAI
import os

# ref: https://cookbook.openai.com/examples/gpt4o/introduction_to_gpt4o
def process_video(video_path, seconds_per_frame=2):
    """Sample frames from a video as base64 JPEGs and extract its audio to MP3.

    Args:
        video_path: Path of the video file; it is opened with both
            ``cv2.VideoCapture`` and ``VideoFileClip``.
        seconds_per_frame: Interval, in seconds of video, between two
            sampled frames.

    Returns:
        A tuple ``(base64_frames, audio_path)`` where ``base64_frames`` is a
        list of base64-encoded JPEG strings and ``audio_path`` is the MP3
        file written next to the video (same path, ``.mp3`` extension).

    Raises:
        ValueError: If the video has no audio track to extract.
    """
    base64_frames = []
    base_video_path, _ = os.path.splitext(video_path)

    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # Always advance by at least one frame: when fps * seconds_per_frame
        # truncates to 0 (low/unreported FPS or a small interval) a zero step
        # would otherwise re-read the same frame forever.
        frames_to_skip = max(1, int(fps * seconds_per_frame))

        for frame_index in range(0, total_frames - 1, frames_to_skip):
            video.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            success, frame = video.read()
            if not success:
                break
            _, buffer = cv2.imencode(".jpg", frame)
            base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
    finally:
        # Release the capture even if decoding or encoding fails.
        video.release()

    audio_path = f"{base_video_path}.mp3"
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError(f"Video has no audio track: {video_path}")
        clip.audio.write_audiofile(audio_path, bitrate="32k")
    finally:
        # Close the audio reader (when present) and the clip on every path.
        if clip.audio is not None:
            clip.audio.close()
        clip.close()

    return base64_frames, audio_path


def summarize_video(api_key, file_path):
    """Summarize a video with OpenAI models using its frames and transcript.

    The video is split into sampled frames and an audio file by
    ``process_video``; the audio is transcribed with ``whisper-1`` and the
    frames plus the transcript are sent to ``gpt-4o`` for summarization.

    Args:
        api_key: OpenAI API key used to construct the client.
        file_path: Video file location, forwarded unchanged to
            ``process_video``.
            NOTE(review): assumes the caller supplies a filesystem path
            rather than a file object -- confirm for the Gradio version used.

    Returns:
        The content of the first choice of the chat completion response.
    """
    client = OpenAI(api_key=api_key)

    # Extract frames and audio (one frame every 0.5 s of video).
    base64Frames, audio_path = process_video(file_path, seconds_per_frame=0.5)

    # Transcribe the audio with Whisper; the context manager guarantees the
    # file handle is closed even if the API call raises.
    with open(audio_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1", file=audio_file
        )

    # One low-detail image part per sampled frame.
    frame_parts = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpg;base64,{encoded_frame}",
                "detail": "low",
            },
        }
        for encoded_frame in base64Frames
    ]

    # Generate the summary with GPT-4o.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """あなたは優秀な要約者です。提供された動画とその書き起こしの要約をMarkdown形式で作成してください""",
            },
            {
                "role": "user",
                "content": [
                    "これらは動画から取得されたフレームです",
                    *frame_parts,
                    {
                        "type": "text",
                        "text": f"動画の書き起こしはこちらです: {transcription.text}",
                    },
                ],
            },
        ],
        temperature=0,
    )

    return response.choices[0].message.content

# Gradio UI: takes an OpenAI API key and a video file, and renders the
# value returned by summarize_video as Markdown.
demo = gr.Interface(
    fn=summarize_video,
    inputs=[
        # Mask the key in the browser so the secret is not shown in clear text.
        gr.Textbox(label="OpenAI API Key", type="password"),
        gr.File(label="Upload Video (mp4)"),
    ],
    outputs="markdown",
    title="Video Summarizer",
    description="動画をアップロードしOpenAIのAPIキーを入力し動画をアップロードすると要約が生成されます。API使用料にご注意ください。詳細: https://cookbook.openai.com/examples/gpt4o/introduction_to_gpt4o",
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()