alibabasglab committed on
Commit
9ec5c63
·
verified ·
1 Parent(s): 702cf5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -82
app.py CHANGED
@@ -40,47 +40,6 @@ def fn_clearvoice_se(input_wav, sr):
40
  sf.write('enhanced.wav', output_wav, fs)
41
  return 'enhanced.wav'
42
 
43
- @spaces.GPU
44
- def fn_clearvoice_ss(input_wav):
45
- myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
46
- output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
47
- if isinstance(output_wav_dict, dict):
48
- key = next(iter(output_wav_dict))
49
- output_wav_list = output_wav_dict[key]
50
- output_wav_s1 = output_wav_list[0]
51
- output_wav_s2 = output_wav_list[1]
52
- else:
53
- output_wav_list = output_wav_dict
54
- output_wav_s1 = output_wav_list[0]
55
- output_wav_s2 = output_wav_list[1]
56
- sf.write('separated_s1.wav', output_wav_s1, 16000)
57
- sf.write('separated_s2.wav', output_wav_s2, 16000)
58
- return "separated_s1.wav", "separated_s2.wav"
59
-
60
- def find_mp4_files(directory):
61
- mp4_files = []
62
-
63
- # Walk through the directory and its subdirectories
64
- for root, dirs, files in os.walk(directory):
65
- for file in files:
66
- # Check if the file ends with .mp4
67
- if file.endswith(".mp4") and file[:3] == 'est':
68
- mp4_files.append(os.path.join(root, file))
69
-
70
- return mp4_files
71
-
72
-
73
- @spaces.GPU()
74
- def fn_clearvoice_tse(input_video):
75
- myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
76
- #output_wav_dict =
77
- print(f'input_video: {input_video}')
78
- myClearVoice(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')
79
-
80
- output_list = find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{os.path.basename(input_video).split(".")[0]}/')
81
-
82
- return output_list
83
-
84
  demo = gr.Blocks()
85
 
86
  sr_demo = gr.Interface(
@@ -99,52 +58,13 @@ sr_demo = gr.Interface(
99
  "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
100
  examples = [
101
  ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
 
 
102
  ["examples/english_speech_48kHz.wav", "48000 Hz"],
103
  ],
104
  cache_examples = True,
105
  )
106
 
107
- ss_demo = gr.Interface(
108
- fn=fn_clearvoice_ss,
109
- inputs = [
110
- gr.Audio(label="Input Audio", type="filepath"),
111
- ],
112
- outputs = [
113
- gr.Audio(label="Output Audio", type="filepath"),
114
- gr.Audio(label="Output Audio", type="filepath"),
115
- ],
116
- title = "<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Speech Separation",
117
- description = ("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is powered by AI and separates individual speech from mixed audio. It supports 16 kHz and two output streams. "
118
- "To try it, simply upload your audio, or click one of the examples. "),
119
- article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> </p>"
120
- "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
121
- examples = [
122
- ['examples/female_female_speech.wav'],
123
- ['examples/female_male_speech.wav'],
124
- ],
125
- cache_examples = True,
126
- )
127
-
128
- tse_demo = gr.Interface(
129
- fn=fn_clearvoice_tse,
130
- inputs = [
131
- gr.Video(label="Input Video"),
132
- ],
133
- outputs = [
134
- gr.Gallery(label="Output Video List")
135
- ],
136
- title = "<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Audio-Visual Speaker Extraction",
137
- description = ("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts each speaker's voice from a multi-speaker video using facial recognition. "
138
- "To try it, simply upload your video, or click one of the examples. "),
139
- # article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
140
- # "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
141
- examples = [
142
- ['examples/001.mp4'],
143
- ['examples/002.mp4'],
144
- ],
145
- cache_examples = True,
146
- )
147
-
148
  with demo:
149
  gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])
150
 
 
40
  sf.write('enhanced.wav', output_wav, fs)
41
  return 'enhanced.wav'
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  demo = gr.Blocks()
44
 
45
  sr_demo = gr.Interface(
 
58
  "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
59
  examples = [
60
  ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
61
+ ["examples/LJSpeech-001-0001-22k.wav", "22050 Hz"],
62
+ ["examples/LibriTTS_986_129388_24k.wav", "24000 Hz"],
63
  ["examples/english_speech_48kHz.wav", "48000 Hz"],
64
  ],
65
  cache_examples = True,
66
  )
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  with demo:
69
  gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])
70