VanYsa committed on
Commit e48d3ab · 1 Parent(s): 44b21fc
Files changed (1)
  1. app.py +43 -228
app.py CHANGED
@@ -1,239 +1,54 @@
- import gradio as gr
- import json
- import librosa
- import os
- import soundfile as sf
- import tempfile
- import uuid
- import transformers
  import torch
- import time
- import spaces
-
- from nemo.collections.asr.models import ASRModel
-
- from transformers import GemmaTokenizer, AutoModelForCausalLM
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
- from threading import Thread

- # Set an environment variable
- HF_TOKEN = os.environ.get("HF_TOKEN", None)

-
- SAMPLE_RATE = 16000 # Hz
- MAX_AUDIO_SECONDS = 40 # won't try to transcribe if longer than this
- DESCRIPTION = '''
- <div>
- <h1 style='text-align: center'>MyAlexa: Voice Chat Assistant</h1>
- <p style='text-align: center'>MyAlexa is a demo of a voice chat assistant with chat logs that accepts audio input and outputs an AI response.</p>
- <p>This space uses <a href="https://huggingface.co/nvidia/canary-1b"><b>NVIDIA Canary 1B</b></a> for automatic speech recognition (ASR), <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama 3 8B Instruct</b></a> as the large language model (LLM), and <a href="https://huggingface.co/docs/transformers/en/model_doc/vits"><b>VITS</b></a> for text to speech (TTS).</p>
- <p>This demo accepts audio inputs no more than 40 seconds long.</p>
- <p>Transcription and responses are limited to the English language.</p>
- </div>
- '''
- PLACEHOLDER = """
- <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
- <img src="https://i.ibb.co/S35q17Q/My-Alexa-Logo.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
- <p style="font-size: 28px; margin-bottom: 2px; opacity: 0.65;">What's on your mind?</p>
- </div>
- """

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- ### ASR model
- canary_model = ASRModel.from_pretrained("nvidia/canary-1b").to(device)
- canary_model.eval()
- # make sure beam size always 1 for consistency
- canary_model.change_decoding_strategy(None)
- decoding_cfg = canary_model.cfg.decoding
- decoding_cfg.beam.beam_size = 1
- canary_model.change_decoding_strategy(decoding_cfg)
-
- ### LLM model
- # Load the tokenizer and model
- tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
- llama3_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto") # to("cuda:0")
-
- if tokenizer.pad_token is None:
-     tokenizer.pad_token = tokenizer.eos_token
-
- terminators = [
-     tokenizer.eos_token_id,
-     tokenizer.convert_tokens_to_ids("<|eot_id|>")
- ]
-
- ### TTS model
-
- def convert_audio(audio_filepath, tmpdir, utt_id):
-     """
-     Convert all files to monochannel 16 kHz wav files.
-     Do not convert and raise error if audio is too long.
-     Returns output filename and duration.
-     """
-
-     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
-
-     duration = librosa.get_duration(y=data, sr=sr)
-
-     if duration > MAX_AUDIO_SECONDS:
-         raise gr.Error(
-             f"This demo can transcribe up to {MAX_AUDIO_SECONDS} seconds of audio. "
-             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
-             "(click on the scissors icon to start trimming audio)."
-         )
-
-     if sr != SAMPLE_RATE:
-         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
-
-     out_filename = os.path.join(tmpdir, utt_id + '.wav')
-
-     # save output audio
-     sf.write(out_filename, data, SAMPLE_RATE)
-
-     return out_filename, duration
-
- def transcribe(audio_filepath):
-     """
-     Transcribes a converted audio file.
-     Set to english language with punctuations.
-     Returns the transcribed text as a string.
-     """
-
-     if audio_filepath is None:
-         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
-
-     utt_id = uuid.uuid4()
-     with tempfile.TemporaryDirectory() as tmpdir:
-         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
-
-         # make manifest file and save
-         manifest_data = {
-             "audio_filepath": converted_audio_filepath,
-             "source_lang": "en",
-             "target_lang": "en",
-             "taskname": "asr",
-             "pnc": "yes",
-             "answer": "predict",
-             "duration": str(duration),
-         }
-
-         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
-
-         with open(manifest_filepath, 'w') as fout:
-             line = json.dumps(manifest_data)
-             fout.write(line + '\n')
-
-         # call transcribe, passing in manifest filepath
-         output_text = canary_model.transcribe(manifest_filepath)[0]
-
-     return output_text
-
- def add_message(history, message):
-     """
-     Adds the input message in the chatbot.
-     Returns the updated chatbot history.
-     """
-     history.append((message, None))
-     return history
-
- def bot(history, message):
-     """
-     Gets the bot's response and places the user and bot messages in the chatbot
-     Returns the appended chatbot history.
-     """
-     response = bot_response(message, history)
-     lines = response.split("\n")
-     complete_lines = '\n'.join(lines[2:])
-     answer = ""
-     for character in complete_lines:
-         answer += character
-         new_tuple = list(history[-1])
-         new_tuple[1] = answer
-         history[-1] = tuple(new_tuple)
-         time.sleep(0.05)
-         yield history
-     #return history
-
- @spaces.GPU()
- def bot_response(message, history):
-     """
-     Generates a streaming response using the llama3-8b model.
-     Set max_new_tokens = 100, temperature=0.6, and top_p=0.9
-     Returns the generated response in string format.
-     """
-     conversation = []
-     for user, assistant in history:
-         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-     conversation.append({"role": "user", "content": message})
-
-     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(llama3_model.device)
-
-     outputs = llama3_model.generate(
-         input_ids,
-         max_new_tokens = 100,
-         eos_token_id = terminators,
-         do_sample=True,
-         temperature=0.6,
-         top_p=0.9,
-         pad_token_id=tokenizer.pad_token_id,
-     )
-
-     out = outputs[0][input_ids.shape[-1]:]
-
-     return tokenizer.decode(out, skip_special_tokens=True)
-
-
- with gr.Blocks(
-     title="MyAlexa",
-     css="""
-     textarea { font-size: 18px;}
-     """,
-     theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
- ) as demo:
-
-     gr.HTML(DESCRIPTION)
-     chatbot = gr.Chatbot(
-         [],
-         elem_id="chatbot",
-         bubble_full_width=False,
-         placeholder=PLACEHOLDER,
-         label='MyAlexa'
-     )
-     with gr.Row():
-         with gr.Column():
-             gr.HTML(
-                 "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
-             )
-
-             audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
-
-
-         with gr.Column():

-             gr.HTML("<p><b>Step 2:</b> Submit your recorded or uploaded audio as input and wait for MyAlexa's response.</p>")

-             submit_button = gr.Button(
-                 value="Submit audio",
-                 variant="primary"
-             )

-             chat_input = gr.Textbox(
-                 label="Transcribed text:",
-                 interactive=False,
-                 placeholder="Enter message",
-                 elem_id="chat_input",
-                 visible=True
-             )

-     chat_msg = chat_input.change(add_message, [chatbot, chat_input], [chatbot])
-     bot_msg = chat_msg.then(bot, [chatbot, chat_input], chatbot, api_name="bot_response")
-
-     submit_button.click(
-         fn=transcribe,
-         inputs = [audio_file],
-         outputs = [chat_input]
-     )

- demo.queue()
- if __name__ == "__main__":
-     demo.launch()
  import torch

+ from transformers import pipeline

+ import numpy as np
+ import gradio as gr

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

+ pipe_dict = {
+     "original_pipe": pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0),
+ }
+
+ # Inference
+ def generate_audio(text):
+
+     output = pipe_dict["original_pipe"](text)
+     output = gr.Audio(value = (output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Prediction from the original checkpoint {'kakao-enterprise/vits-ljs'}", show_label=True,
+                       visible=True)
+
+     ###############language = "english"
+     return output
+
+ css = """
+ #container{
+     margin: 0 auto;
+     max-width: 80rem;
+ }
+ #intro{
+     max-width: 100%;
+     text-align: center;
+     margin: 0 auto;
+ }
+ """

+ # Gradio blocks demo
+ with gr.Blocks(css=css) as demo_blocks:

+     with gr.Row():
+         with gr.Column():
+             inp_text = gr.Textbox(label="Input Text", info="What sentence would you like to synthesise?")
+             btn = gr.Button("Generate Audio!")

+
+         with gr.Column():
+             outputs = []
+             max_speakers = 1  # assumed value: kakao-enterprise/vits-ljs is a single-speaker checkpoint
+             for i in range(max_speakers):
+                 out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
+                 outputs.append(out_audio)

+     btn.click(generate_audio, [inp_text], outputs)
+

+ demo_blocks.queue().launch()
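
For context, the new app.py boils down to a single transformers text-to-speech pipeline call wrapped in a Gradio UI. Below is a minimal standalone sketch of that path, assuming the same kakao-enterprise/vits-ljs checkpoint used in the commit; the soundfile dependency, the output filename, and the example sentence are illustrative choices and not part of this commit.

import soundfile as sf
from transformers import pipeline

# Same checkpoint as in app.py; runs on CPU by default.
tts = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# The pipeline returns a dict with the waveform ("audio") and its "sampling_rate".
result = tts("Hello, this is a test of the VITS text to speech model.")
sf.write("vits_sample.wav", result["audio"].squeeze(), result["sampling_rate"])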