freddyaboulton (HF staff) committed
Commit 30a4774 · verified · 1 Parent(s): a0ba2e3

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +9 -6
  2. app.py +207 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,15 @@
  ---
- title: Gemini Audio Video
- emoji:
- colorFrom: blue
- colorTo: pink
+ title: Talk to Gemini
+ emoji: ♊️
+ colorFrom: purple
+ colorTo: red
  sdk: gradio
- sdk_version: 5.17.1
+ sdk_version: 5.16.0
  app_file: app.py
  pinned: false
+ license: mit
+ short_description: Talk to Gemini using Google's multimodal API
+ tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
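
The `secret|...` tags above declare the three secrets this Space expects: GEMINI_API_KEY (read via `os.getenv` in app.py) and the Twilio pair used for WebRTC TURN credentials, which, assuming fastrtc's defaults, `get_twilio_turn_credentials()` reads from the environment. Since app.py calls `load_dotenv()`, a local `.env` file also works when running outside of Spaces; a minimal sketch with placeholder values:

    GEMINI_API_KEY=your-gemini-api-key
    TWILIO_ACCOUNT_SID=your-twilio-account-sid
    TWILIO_AUTH_TOKEN=your-twilio-auth-token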
app.py ADDED
@@ -0,0 +1,207 @@
+ import asyncio
+ import base64
+ import os
+ import time
+ from io import BytesIO
+
+ import gradio as gr
+ from gradio.utils import get_space
+ import numpy as np
+ from google import genai
+ from dotenv import load_dotenv
+ from fastrtc import (
+     AsyncAudioVideoStreamHandler,
+     Stream,
+     get_twilio_turn_credentials,
+     WebRTC,
+ )
+ from PIL import Image
+
+ load_dotenv()
+
+
+ def encode_audio(data: np.ndarray) -> dict:
+     """Encode Audio data to send to the server"""
+     return {
+         "mime_type": "audio/pcm",
+         "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
+     }
+
+
+ def encode_image(data: np.ndarray) -> dict:
+     with BytesIO() as output_bytes:
+         pil_image = Image.fromarray(data)
+         pil_image.save(output_bytes, "JPEG")
+         bytes_data = output_bytes.getvalue()
+     base64_str = str(base64.b64encode(bytes_data), "utf-8")
+     return {"mime_type": "image/jpeg", "data": base64_str}
+
+
+ class GeminiHandler(AsyncAudioVideoStreamHandler):
+     def __init__(
+         self,
+     ) -> None:
+         super().__init__(
+             "mono",
+             output_sample_rate=24000,
+             output_frame_size=480,
+             input_sample_rate=16000,
+         )
+         self.audio_queue = asyncio.Queue()
+         self.video_queue = asyncio.Queue()
+         self.quit = asyncio.Event()
+         self.session = None
+         self.last_frame_time = 0
+         self.quit = asyncio.Event()
+
+     def copy(self) -> "GeminiHandler":
+         return GeminiHandler()
+
+     async def start_up(self):
+         client = genai.Client(
+             api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
+         )
+         config = {"response_modalities": ["AUDIO"]}
+         try:
+             async with client.aio.live.connect(
+                 model="gemini-2.0-flash-exp", config=config
+             ) as session:
+                 self.session = session
+                 print("set session")
+                 while not self.quit.is_set():
+                     turn = self.session.receive()
+                     async for response in turn:
+                         if data := response.data:
+                             audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
+                             self.audio_queue.put_nowait(audio)
+         except Exception as e:
+             import traceback
+
+             traceback.print_exc()
+
+     async def video_receive(self, frame: np.ndarray):
+         try:
+             print("out")
+             if self.session:
+                 print("here")
+                 # send image every 1 second
+                 print(time.time() - self.last_frame_time)
+                 if time.time() - self.last_frame_time > 1:
+                     self.last_frame_time = time.time()
+                     print("sending image")
+                     await self.session.send(input=encode_image(frame))
+                     print("sent image")
+                     if self.latest_args[1] is not None:
+                         print("sending image2")
+                         await self.session.send(input=encode_image(self.latest_args[1]))
+                         print("sent image2")
+         except Exception as e:
+             print(e)
+             import traceback
+
+             traceback.print_exc()
+         self.video_queue.put_nowait(frame)
+
+     async def video_emit(self):
+         return await self.video_queue.get()
+
+     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+         _, array = frame
+         array = array.squeeze()
+         audio_message = encode_audio(array)
+         if self.session:
+             try:
+                 await self.session.send(input=audio_message)
+             except Exception as e:
+                 print(e)
+                 import traceback
+
+                 traceback.print_exc()
+
+     async def emit(self):
+         array = await self.audio_queue.get()
+         return (self.output_sample_rate, array)
+
+     async def shutdown(self) -> None:
+         if self.session:
+             self.quit.set()
+             await self.session._websocket.close()
+             self.quit.clear()
+
+
+ stream = Stream(
+     handler=GeminiHandler(),
+     modality="audio-video",
+     mode="send-receive",
+     rtc_configuration=get_twilio_turn_credentials()
+     if get_space() == "spaces"
+     else None,
+     time_limit=90 if get_space() else None,
+     additional_inputs=[
+         gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
+     ],
+     ui_args={
+         "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+         "pulse_color": "rgb(35, 157, 225)",
+         "icon_button_color": "rgb(35, 157, 225)",
+         "title": "Gemini Audio Video Chat",
+     },
+ )
+
+ css = """
+ #video-source {max-width: 600px !important; max-height: 600px !important;}
+ """
+
+ with gr.Blocks(css=css) as demo:
+     gr.HTML(
+         """
+     <div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
+         <div style="background-color: var(--block-background-fill); border-radius: 8px">
+             <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
+         </div>
+         <div>
+             <h1>Gen AI SDK Voice Chat</h1>
+             <p>Speak with Gemini using real-time audio + video streaming</p>
+             <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href="https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
+             <p>Get an API Key <a href="https://support.google.com/googleapi/answer/6158862?hl=en">here</a></p>
+         </div>
+     </div>
+     """
+     )
+     with gr.Row() as row:
+         with gr.Column():
+             webrtc = WebRTC(
+                 label="Video Chat",
+                 modality="audio-video",
+                 mode="send-receive",
+                 elem_id="video-source",
+                 rtc_configuration=get_twilio_turn_credentials()
+                 if get_space() == "spaces"
+                 else None,
+                 icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
+                 pulse_color="rgb(35, 157, 225)",
+                 icon_button_color="rgb(35, 157, 225)",
+             )
+         with gr.Column():
+             image_input = gr.Image(
+                 label="Image", type="numpy", sources=["upload", "clipboard"]
+             )
+
+     webrtc.stream(
+         GeminiHandler(),
+         inputs=[webrtc, image_input],
+         outputs=[webrtc],
+         time_limit=60 if get_space() else None,
+         concurrency_limit=2 if get_space() else None,
+     )
+
+ stream.ui = demo
+
+
+ if __name__ == "__main__":
+     if (mode := os.getenv("MODE")) == "UI":
+         stream.ui.launch(server_port=7860)
+     elif mode == "PHONE":
+         raise ValueError("Phone mode not supported for this demo")
+     else:
+         stream.ui.launch(server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ fastrtc
+ python-dotenv
+ google-genai
+ twilio
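
To run the demo locally, a minimal sketch (assuming the secrets above are available in a `.env` file or the environment):

    pip install -r requirements.txt
    python app.py

app.py serves the Gradio UI on port 7860; setting `MODE=UI` launches the same UI, while `MODE=PHONE` raises an error since phone mode is not supported for this demo.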