ClearVoice-SR

Running on Zero

App Files Files Community

alibabasglab committed 21 days ago

Commit

9ec5c63

verified ·

1 Parent(s): 702cf5d

Update app.py

Browse files

Files changed (1) hide show

app.py +2 -82

app.py CHANGED Viewed

@@ -40,47 +40,6 @@ def fn_clearvoice_se(input_wav, sr):
     sf.write('enhanced.wav', output_wav, fs)
     return 'enhanced.wav'
-@spaces.GPU
-def fn_clearvoice_ss(input_wav):
-    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
-    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
-    if isinstance(output_wav_dict, dict):
-        key = next(iter(output_wav_dict))
-        output_wav_list = output_wav_dict[key]
-        output_wav_s1 = output_wav_list[0]
-        output_wav_s2 = output_wav_list[1]
-    else:
-        output_wav_list = output_wav_dict
-        output_wav_s1 = output_wav_list[0]
-        output_wav_s2 = output_wav_list[1]
-    sf.write('separated_s1.wav', output_wav_s1, 16000)
-    sf.write('separated_s2.wav', output_wav_s2, 16000)
-    return "separated_s1.wav", "separated_s2.wav"
-def find_mp4_files(directory):
-    mp4_files = []
-    # Walk through the directory and its subdirectories
-    for root, dirs, files in os.walk(directory):
-        for file in files:
-            # Check if the file ends with .mp4
-            if file.endswith(".mp4") and file[:3] == 'est':
-                mp4_files.append(os.path.join(root, file))
-    return mp4_files
-@spaces.GPU()
-def fn_clearvoice_tse(input_video):
-    myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
-    #output_wav_dict =
-    print(f'input_video: {input_video}')
-    myClearVoice(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')
-    output_list = find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{os.path.basename(input_video).split(".")[0]}/')
-    return output_list
 demo = gr.Blocks()
 sr_demo = gr.Interface(
@@ -99,52 +58,13 @@ sr_demo = gr.Interface(
               "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
     examples = [
         ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
         ["examples/english_speech_48kHz.wav", "48000 Hz"],
     ],
     cache_examples = True,
 )
-ss_demo = gr.Interface(
-    fn=fn_clearvoice_ss,
-    inputs = [
-        gr.Audio(label="Input Audio", type="filepath"),
-    ],
-    outputs = [
-        gr.Audio(label="Output Audio", type="filepath"),
-        gr.Audio(label="Output Audio", type="filepath"),
-    ],
-    title = "<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Speech Separation",
-    description = ("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is powered by AI and separates individual speech from mixed audio. It supports 16 kHz and two output streams. "
-                    "To try it, simply upload your audio, or click one of the examples. "),
-    article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> </p>"
-              "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
-    examples = [
-        ['examples/female_female_speech.wav'],
-        ['examples/female_male_speech.wav'],
-    ],
-    cache_examples = True,
-)
-tse_demo = gr.Interface(
-    fn=fn_clearvoice_tse,
-    inputs = [
-        gr.Video(label="Input Video"),
-    ],
-    outputs = [
-        gr.Gallery(label="Output Video List")
-    ],
-    title = "<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Audio-Visual Speaker Extraction",
-    description = ("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts each speaker's voice from a multi-speaker video using facial recognition. "
-                    "To try it, simply upload your video, or click one of the examples. "),
-    # article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
-    #           "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
-    examples = [
-        ['examples/001.mp4'],
-        ['examples/002.mp4'],
-    ],
-    cache_examples = True,
-)
 with demo:
     gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])

     sf.write('enhanced.wav', output_wav, fs)
     return 'enhanced.wav'
 demo = gr.Blocks()
 sr_demo = gr.Interface(
               "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
     examples = [
         ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
+        ["examples/LJSpeech-001-0001-22k.wav", "22050 Hz"],
+        ["examples/LibriTTS_986_129388_24k.wav", "24000 Hz"],
         ["examples/english_speech_48kHz.wav", "48000 Hz"],
     ],
     cache_examples = True,
 )
 with demo:
     gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])