Spaces:
Sleeping
Sleeping
jhj0517
committed on
Commit
·
670baea
1
Parent(s):
16a0393
Move vad feature into abstract class
Browse files
modules/whisper/faster_whisper_inference.py
CHANGED
@@ -71,20 +71,6 @@ class FasterWhisperInference(WhisperBase):
|
|
71 |
if not params.hotwords:
|
72 |
params.hotwords = None
|
73 |
|
74 |
-
vad_options = None
|
75 |
-
if params.vad_filter:
|
76 |
-
# Explicit value set for float('inf') from gr.Number()
|
77 |
-
if params.max_speech_duration_s >= 9999:
|
78 |
-
params.max_speech_duration_s = float('inf')
|
79 |
-
|
80 |
-
vad_options = VadOptions(
|
81 |
-
threshold=params.threshold,
|
82 |
-
min_speech_duration_ms=params.min_speech_duration_ms,
|
83 |
-
max_speech_duration_s=params.max_speech_duration_s,
|
84 |
-
min_silence_duration_ms=params.min_silence_duration_ms,
|
85 |
-
speech_pad_ms=params.speech_pad_ms
|
86 |
-
)
|
87 |
-
|
88 |
params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
|
89 |
|
90 |
segments, info = self.model.transcribe(
|
@@ -115,8 +101,6 @@ class FasterWhisperInference(WhisperBase):
|
|
115 |
language_detection_threshold=params.language_detection_threshold,
|
116 |
language_detection_segments=params.language_detection_segments,
|
117 |
prompt_reset_on_temperature=params.prompt_reset_on_temperature,
|
118 |
-
vad_filter=params.vad_filter,
|
119 |
-
vad_parameters=vad_options
|
120 |
)
|
121 |
progress(0, desc="Loading audio..")
|
122 |
|
|
|
71 |
if not params.hotwords:
|
72 |
params.hotwords = None
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
|
75 |
|
76 |
segments, info = self.model.transcribe(
|
|
|
101 |
language_detection_threshold=params.language_detection_threshold,
|
102 |
language_detection_segments=params.language_detection_segments,
|
103 |
prompt_reset_on_temperature=params.prompt_reset_on_temperature,
|
|
|
|
|
104 |
)
|
105 |
progress(0, desc="Loading audio..")
|
106 |
|
modules/whisper/whisper_base.py
CHANGED
@@ -91,12 +91,38 @@ class WhisperBase(ABC):
|
|
91 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
92 |
params.lang = language_code_dict[params.lang]
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
result, elapsed_time = self.transcribe(
|
95 |
audio,
|
96 |
progress,
|
97 |
*astuple(params)
|
98 |
)
|
99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
if params.is_diarize:
|
101 |
result, elapsed_time_diarization = self.diarizer.run(
|
102 |
audio=audio,
|
|
|
91 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
92 |
params.lang = language_code_dict[params.lang]
|
93 |
|
94 |
+
speech_chunks = None
|
95 |
+
if params.vad_filter:
|
96 |
+
# Explicit value set for float('inf') from gr.Number()
|
97 |
+
if params.max_speech_duration_s >= 9999:
|
98 |
+
params.max_speech_duration_s = float('inf')
|
99 |
+
|
100 |
+
vad_options = VadOptions(
|
101 |
+
threshold=params.threshold,
|
102 |
+
min_speech_duration_ms=params.min_speech_duration_ms,
|
103 |
+
max_speech_duration_s=params.max_speech_duration_s,
|
104 |
+
min_silence_duration_ms=params.min_silence_duration_ms,
|
105 |
+
speech_pad_ms=params.speech_pad_ms
|
106 |
+
)
|
107 |
+
|
108 |
+
audio, speech_chunks = self.vad.run(
|
109 |
+
audio=audio,
|
110 |
+
vad_parameters=vad_options,
|
111 |
+
progress=progress
|
112 |
+
)
|
113 |
+
|
114 |
result, elapsed_time = self.transcribe(
|
115 |
audio,
|
116 |
progress,
|
117 |
*astuple(params)
|
118 |
)
|
119 |
|
120 |
+
if params.vad_filter:
|
121 |
+
result = self.vad.restore_speech_timestamps(
|
122 |
+
segments=result,
|
123 |
+
speech_chunks=speech_chunks,
|
124 |
+
)
|
125 |
+
|
126 |
if params.is_diarize:
|
127 |
result, elapsed_time_diarization = self.diarizer.run(
|
128 |
audio=audio,
|