jhj0517 committed
Commit 670baea · 1 Parent(s): 16a0393

Move vad feature into abstract class

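A rough sketch of the structure this commit moves toward, based only on the two hunks below: the abstract base class owns the VAD step, and concrete backends such as FasterWhisperInference only implement transcribe(). Anything not visible in the diff (the constructor, the simplified arguments) is illustrative, not the project's actual code.

from abc import ABC, abstractmethod


class WhisperBase(ABC):
    def __init__(self, vad):
        self.vad = vad  # helper exposing run() and restore_speech_timestamps()

    def run(self, audio, vad_filter: bool = False):
        speech_chunks = None
        if vad_filter:
            # VAD now happens once here, before any backend sees the audio
            audio, speech_chunks = self.vad.run(audio=audio)
        result = self.transcribe(audio)  # backend-specific work only
        if vad_filter:
            # map timestamps from the silence-trimmed audio back to the original timeline
            result = self.vad.restore_speech_timestamps(segments=result,
                                                        speech_chunks=speech_chunks)
        return result

    @abstractmethod
    def transcribe(self, audio):
        """Implemented by each concrete backend (e.g. faster-whisper)."""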
modules/whisper/faster_whisper_inference.py CHANGED
@@ -71,20 +71,6 @@ class FasterWhisperInference(WhisperBase):
         if not params.hotwords:
             params.hotwords = None
 
-        vad_options = None
-        if params.vad_filter:
-            # Explicit value set for float('inf') from gr.Number()
-            if params.max_speech_duration_s >= 9999:
-                params.max_speech_duration_s = float('inf')
-
-            vad_options = VadOptions(
-                threshold=params.threshold,
-                min_speech_duration_ms=params.min_speech_duration_ms,
-                max_speech_duration_s=params.max_speech_duration_s,
-                min_silence_duration_ms=params.min_silence_duration_ms,
-                speech_pad_ms=params.speech_pad_ms
-            )
-
         params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
 
         segments, info = self.model.transcribe(
@@ -115,8 +101,6 @@ class FasterWhisperInference(WhisperBase):
             language_detection_threshold=params.language_detection_threshold,
             language_detection_segments=params.language_detection_segments,
             prompt_reset_on_temperature=params.prompt_reset_on_temperature,
-            vad_filter=params.vad_filter,
-            vad_parameters=vad_options
         )
         progress(0, desc="Loading audio..")
 
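For context, the deleted lines above configured faster-whisper's built-in VAD: a VadOptions object passed through the vad_filter / vad_parameters keywords of WhisperModel.transcribe(). A minimal sketch of that built-in path follows; the model name, audio path, and parameter values are placeholders, not the project's settings. After this commit the backend no longer passes these keywords, because silence is stripped in WhisperBase before the audio reaches model.transcribe().

from faster_whisper import WhisperModel
from faster_whisper.vad import VadOptions

model = WhisperModel("large-v2")          # placeholder model size
segments, info = model.transcribe(
    "audio.wav",                          # placeholder input path
    vad_filter=True,                      # let faster-whisper run Silero VAD internally
    vad_parameters=VadOptions(
        threshold=0.5,
        min_speech_duration_ms=250,
        max_speech_duration_s=float("inf"),
        min_silence_duration_ms=2000,
        speech_pad_ms=400,
    ),
)
for segment in segments:                  # transcription is lazy; iterate to run it
    print(segment.start, segment.end, segment.text)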
modules/whisper/whisper_base.py CHANGED
@@ -91,12 +91,38 @@ class WhisperBase(ABC):
         language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
         params.lang = language_code_dict[params.lang]
 
+        speech_chunks = None
+        if params.vad_filter:
+            # Explicit value set for float('inf') from gr.Number()
+            if params.max_speech_duration_s >= 9999:
+                params.max_speech_duration_s = float('inf')
+
+            vad_options = VadOptions(
+                threshold=params.threshold,
+                min_speech_duration_ms=params.min_speech_duration_ms,
+                max_speech_duration_s=params.max_speech_duration_s,
+                min_silence_duration_ms=params.min_silence_duration_ms,
+                speech_pad_ms=params.speech_pad_ms
+            )
+
+            audio, speech_chunks = self.vad.run(
+                audio=audio,
+                vad_parameters=vad_options,
+                progress=progress
+            )
+
         result, elapsed_time = self.transcribe(
             audio,
             progress,
             *astuple(params)
         )
 
+        if params.vad_filter:
+            result = self.vad.restore_speech_timestamps(
+                segments=result,
+                speech_chunks=speech_chunks,
+            )
+
         if params.is_diarize:
             result, elapsed_time_diarization = self.diarizer.run(
                 audio=audio,
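The base class now relies on a self.vad helper with two methods, run() and restore_speech_timestamps(), whose shapes can only be inferred from the call sites above: run() returns the silence-trimmed audio plus the detected speech chunks, and restore_speech_timestamps() maps segment times from the trimmed audio back onto the original timeline. The stand-in below only illustrates that contract; the class name, segment representation, and sample rate are assumptions, not the repository's implementation.

from typing import Dict, List, Tuple

import numpy as np

SAMPLE_RATE = 16000  # assumed sampling rate


class FakeVad:
    """Stand-in with the same call shape as the self.vad used in WhisperBase above."""

    def run(self,
            audio: np.ndarray,
            vad_parameters=None,
            progress=None) -> Tuple[np.ndarray, List[Dict[str, int]]]:
        # A real implementation would detect speech using the VAD parameters;
        # here we simply pretend everything after the first second is speech.
        speech_chunks = [{"start": SAMPLE_RATE, "end": len(audio)}]  # sample indices
        trimmed = np.concatenate([audio[c["start"]:c["end"]] for c in speech_chunks])
        return trimmed, speech_chunks

    def restore_speech_timestamps(self,
                                  segments: List[Dict[str, float]],
                                  speech_chunks: List[Dict[str, int]]) -> List[Dict[str, float]]:
        # Segment times are relative to the trimmed audio; add back the silence that was
        # cut before each speech chunk to recover times on the original timeline.
        restored = []
        offset = 0.0  # seconds of trimmed audio consumed so far
        for chunk in speech_chunks:
            chunk_start = chunk["start"] / SAMPLE_RATE
            chunk_len = (chunk["end"] - chunk["start"]) / SAMPLE_RATE
            for seg in segments:
                if offset <= seg["start"] < offset + chunk_len:
                    restored.append({
                        "start": seg["start"] - offset + chunk_start,
                        "end": seg["end"] - offset + chunk_start,
                        "text": seg["text"],
                    })
            offset += chunk_len
        return restored


if __name__ == "__main__":
    vad = FakeVad()
    audio = np.zeros(SAMPLE_RATE * 5, dtype=np.float32)       # 5 s of dummy audio
    trimmed, chunks = vad.run(audio=audio, vad_parameters=None)
    segments = [{"start": 0.0, "end": 2.0, "text": "hello"}]  # times within trimmed audio
    print(vad.restore_speech_timestamps(segments=segments, speech_chunks=chunks))
    # [{'start': 1.0, 'end': 3.0, 'text': 'hello'}]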