invincible-jha committed
Commit 183c824 · verified · 1 Parent(s): 2cce526

Upload app.py

Files changed (1)
  1. app.py +270 -48
app.py CHANGED
@@ -1,3 +1,4 @@
+# Import necessary libraries for the voice analysis system
 import gradio as gr
 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -8,109 +9,326 @@ import plotly.graph_objects as go
 import warnings
 import os
 from scipy.stats import kurtosis, skew
+
+# Suppress unnecessary warnings for cleaner output
 warnings.filterwarnings('ignore')

-# Global variables to store models
+# Initialize global variables for model storage
 processor = None
 whisper_model = None
 emotion_tokenizer = None
 emotion_model = None

 def load_models():
-    """Initialize and load all required models with proper error handling"""
+    """Initialize and load all required machine learning models.
+
+    This function handles the loading of both the Whisper speech recognition model
+    and the emotion detection model. It includes proper error handling and
+    device management for optimal performance.
+
+    Returns:
+        bool: True if all models loaded successfully, False otherwise
+    """
     global processor, whisper_model, emotion_tokenizer, emotion_model

     try:
+        # Load the Whisper model for speech recognition
         print("Loading Whisper model...")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
         whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

+        # Load the emotion detection model
         print("Loading emotion model...")
         emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
         emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

-        # Move models to appropriate device
-        device = "cpu"  # Force CPU usage for stability
+        # Move models to CPU for consistent performance
+        device = "cpu"
         whisper_model.to(device)
         emotion_model.to(device)

         print("Models loaded successfully!")
         return True
+
     except Exception as e:
         print(f"Error loading models: {str(e)}")
         return False

-# Your existing feature extraction functions remain the same
 def extract_prosodic_features(waveform, sr):
-    """Extract prosodic features from audio"""
+    """Extract voice characteristics from audio data with enhanced error handling.
+
+    This function analyzes the audio waveform to extract various voice features
+    including pitch, energy, rhythm, and voice quality metrics. It includes
+    robust error handling and validation for each feature.
+
+    Args:
+        waveform (numpy.ndarray): Audio signal
+        sr (int): Sampling rate of the audio
+
+    Returns:
+        dict: Dictionary containing extracted features or None if extraction fails
+    """
     try:
+        # Validate input waveform
+        if waveform is None or len(waveform) == 0:
+            print("Error: Empty or invalid waveform")
+            return None
+
         features = {}
-        # ... (rest of your existing function)
+
+        # Extract pitch features with enhanced reliability
+        try:
+            # Configure pitch detection parameters for optimal results
+            pitches, magnitudes = librosa.piptrack(
+                y=waveform,
+                sr=sr,
+                fmin=50,          # Minimum frequency for human voice
+                fmax=2000,        # Maximum frequency for human voice
+                n_mels=128,       # Frequency resolution
+                hop_length=512,   # Time resolution
+                win_length=2048   # Analysis window size
+            )
+
+            # Extract and validate pitch contour
+            f0_contour = []
+            for t in range(pitches.shape[1]):
+                index = magnitudes[:, t].argmax()
+                pitch = pitches[index, t]
+                if 50 <= pitch <= 2000:  # Physiologically valid range
+                    f0_contour.append(pitch)
+
+            f0_contour = np.array(f0_contour)
+
+            # Calculate pitch statistics with validation
+            if len(f0_contour) > 0:
+                features['pitch_mean'] = float(np.mean(f0_contour))
+                features['pitch_std'] = float(np.std(f0_contour))
+                features['pitch_range'] = float(np.ptp(f0_contour))
+            else:
+                # Use default values if no valid pitch detected
+                features['pitch_mean'] = 160.0  # Average adult speaking pitch
+                features['pitch_std'] = 0.0
+                features['pitch_range'] = 0.0
+
+        except Exception as e:
+            print(f"Error in pitch extraction: {str(e)}")
+            features['pitch_mean'] = 160.0
+            features['pitch_std'] = 0.0
+            features['pitch_range'] = 0.0
+
+        # Extract energy features with noise reduction
+        try:
+            rms = librosa.feature.rms(
+                y=waveform,
+                frame_length=2048,
+                hop_length=512,
+                center=True
+            )[0]
+
+            features['energy_mean'] = float(np.mean(rms))
+            features['energy_std'] = float(np.std(rms))
+            features['energy_range'] = float(np.ptp(rms))
+
+        except Exception as e:
+            print(f"Error in energy extraction: {str(e)}")
+            features['energy_mean'] = 0.02
+            features['energy_std'] = 0.0
+            features['energy_range'] = 0.0
+
+        # Extract rhythm features with improved accuracy
+        try:
+            onset_env = librosa.onset.onset_strength(
+                y=waveform,
+                sr=sr,
+                hop_length=512,
+                aggregate=np.median
+            )
+
+            tempo = librosa.beat.tempo(
+                onset_envelope=onset_env,
+                sr=sr,
+                hop_length=512,
+                aggregate=None
+            )
+
+            # Validate tempo within normal speech range
+            if 40 <= tempo[0] <= 240:
+                features['tempo'] = float(tempo[0])
+            else:
+                features['tempo'] = 120.0  # Default speaking rate
+
+        except Exception as e:
+            print(f"Error in rhythm extraction: {str(e)}")
+            features['tempo'] = 120.0
+
+        # Verify all required features are present
+        required_features = [
+            'pitch_mean', 'pitch_std', 'pitch_range',
+            'energy_mean', 'energy_std', 'energy_range',
+            'tempo'
+        ]
+
+        for feature in required_features:
+            if feature not in features or not isinstance(features[feature], (int, float)):
+                print(f"Warning: Invalid or missing feature: {feature}")
+                features[feature] = 0.0
+
         return features
+
     except Exception as e:
-        print(f"Error in extract_prosodic_features: {str(e)}")
+        print(f"Critical error in extract_prosodic_features: {str(e)}")
         return None

 def create_feature_plots(features):
-    """Create visualizations for audio features"""
+    """Create visualizations for the extracted voice features.
+
+    This function generates interactive plots showing the various voice
+    characteristics including pitch, energy, and rhythm features.
+
+    Args:
+        features (dict): Dictionary containing the extracted voice features
+
+    Returns:
+        str: HTML representation of the plots or None if visualization fails
+    """
     try:
-        # ... (rest of your existing function)
+        fig = go.Figure()
+
+        # Add pitch feature visualization
+        pitch_data = {
+            'Mean': features['pitch_mean'],
+            'Std Dev': features['pitch_std'],
+            'Range': features['pitch_range']
+        }
+
+        fig.add_trace(go.Bar(
+            name='Pitch Features (Hz)',
+            x=list(pitch_data.keys()),
+            y=list(pitch_data.values()),
+            marker_color='blue'
+        ))
+
+        # Add energy feature visualization
+        energy_data = {
+            'Mean': features['energy_mean'],
+            'Std Dev': features['energy_std'],
+            'Range': features['energy_range']
+        }
+
+        fig.add_trace(go.Bar(
+            name='Energy Features',
+            x=[f"Energy {k}" for k in energy_data.keys()],
+            y=list(energy_data.values()),
+            marker_color='red'
+        ))
+
+        # Add tempo indicator
+        fig.add_trace(go.Scatter(
+            name='Speech Rate (BPM)',
+            x=['Tempo'],
+            y=[features['tempo']],
+            mode='markers',
+            marker=dict(size=15, color='green')
+        ))
+
+        # Configure layout for better visualization
+        fig.update_layout(
+            title='Voice Feature Analysis',
+            showlegend=True,
+            height=600,
+            barmode='group',
+            xaxis_title='Feature Type',
+            yaxis_title='Value',
+            template='plotly_white'
+        )
+
         return fig.to_html(include_plotlyjs=True)
+
     except Exception as e:
         print(f"Error in create_feature_plots: {str(e)}")
         return None

 def create_emotion_plot(emotions):
-    """Create emotion analysis visualization"""
+    """Create visualization for emotion analysis results.
+
+    Args:
+        emotions (dict): Dictionary containing emotion scores
+
+    Returns:
+        str: HTML representation of the emotion plot or None if visualization fails
+    """
     try:
-        # ... (rest of your existing function)
+        fig = go.Figure(data=[
+            go.Bar(
+                x=list(emotions.keys()),
+                y=list(emotions.values()),
+                marker_color=['#FF9999', '#66B2FF', '#99FF99',
+                              '#FFCC99', '#FF99CC', '#99FFFF']
+            )
+        ])
+
+        fig.update_layout(
+            title='Emotion Analysis',
+            xaxis_title='Emotion',
+            yaxis_title='Confidence Score',
+            yaxis_range=[0, 1],
+            template='plotly_white',
+            height=400
+        )
+
         return fig.to_html(include_plotlyjs=True)
     except Exception as e:
         print(f"Error in create_emotion_plot: {str(e)}")
         return None

 def analyze_audio(audio_input):
-    """Main function to analyze audio input"""
-    global processor, whisper_model, emotion_tokenizer, emotion_model
+    """Main function for analyzing audio input with comprehensive error handling.
+
+    This function coordinates the entire analysis pipeline including:
+    - Audio loading and validation
+    - Feature extraction
+    - Speech recognition
+    - Emotion analysis
+    - Visualization generation

+    Args:
+        audio_input: Audio file path or tuple containing audio data
+
+    Returns:
+        tuple: (analysis_summary, emotion_visualization, feature_visualization)
+    """
     try:
         if audio_input is None:
             return "Please provide an audio input", None, None

-        print(f"Processing audio input: {type(audio_input)}")
-
-        # Handle audio input
+        # Handle audio input and validate format
         if isinstance(audio_input, tuple):
             audio_path = audio_input[0]
         else:
             audio_path = audio_input

-        print(f"Loading audio from path: {audio_path}")
-
-        # Load audio
-        waveform, sr = librosa.load(audio_path, sr=16000)
-        print(f"Audio loaded: {waveform.shape}, SR: {sr}")
+        # Load and validate audio
+        waveform, sr = librosa.load(audio_path, sr=16000, duration=30)
+        duration = len(waveform) / sr

+        if duration < 0.5:
+            return "Audio too short. Please provide a recording of at least 0.5 seconds.", None, None
+
         # Extract voice features
-        print("Extracting voice features...")
         features = extract_prosodic_features(waveform, sr)
         if features is None:
-            return "Error extracting voice features", None, None
+            return "Error extracting voice features. Please try recording again.", None, None

-        # Create feature plots
-        print("Creating feature visualizations...")
+        # Create visualizations
         feature_viz = create_feature_plots(features)

-        # Transcribe audio
-        print("Transcribing audio...")
+        # Perform speech recognition
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
-
         with torch.no_grad():
             predicted_ids = whisper_model.generate(inputs)
         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

         # Analyze emotions
-        print("Analyzing emotions...")
         emotion_inputs = emotion_tokenizer(
             transcription,
             return_tensors="pt",
@@ -131,7 +349,7 @@ def analyze_audio(audio_input):

         emotion_viz = create_emotion_plot(emotion_scores)

-        # Create analysis summary
+        # Generate comprehensive analysis summary
         summary = f"""Voice Analysis Summary:

 Speech Content:
@@ -144,6 +362,9 @@ Voice Characteristics:
 - Voice Energy: {features['energy_mean']:.4f}

 Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
+Emotion Confidence: {max(emotion_scores.values()):.2%}
+
+Recording Duration: {duration:.2f} seconds
 """

         return summary, emotion_viz, feature_viz
@@ -158,15 +379,14 @@ try:
     print("===== Application Startup =====")
     if not load_models():
         raise RuntimeError("Failed to load required models")
-    print("Models loaded successfully, creating Gradio interface...")

-    # Create Gradio interface
+    # Create Gradio interface with enhanced user guidance
     demo = gr.Interface(
         fn=analyze_audio,
         inputs=gr.Audio(
             sources=["microphone", "upload"],
             type="filepath",
-            label="Audio Input"
+            label="Audio Input (Recommended: 1-5 seconds of clear speech)"
         ),
         outputs=[
             gr.Textbox(label="Analysis Summary", lines=10),
@@ -175,33 +395,35 @@ try:
         ],
         title="Voice Analysis System",
         description="""
-    This application analyzes voice recordings to extract various characteristics:
+    This application provides detailed voice analysis through multiple components:

     1. Voice Features:
-    - Pitch analysis
-    - Energy patterns
-    - Speech rate
-    - Voice quality
+    - Pitch analysis (fundamental frequency and variation)
+    - Energy patterns (volume and intensity)
+    - Speech rate (words per minute)
+    - Voice quality metrics

     2. Emotional Content:
-    - Emotion detection
-    - Emotional intensity
+    - Emotion detection (6 basic emotions)
+    - Emotional intensity analysis

     3. Speech Content:
-    - Text transcription
+    - Accurate text transcription

+    For optimal results:
+    - Record in a quiet environment
+    - Speak clearly and naturally
+    - Keep recordings between 1-5 seconds
+    - Maintain consistent volume
+
     Upload an audio file or record directly through your microphone.
-    """,
-        examples=None,
-        cache_examples=False
+    """
     )

-    print("Gradio interface created successfully")
-
     # Launch the interface
     if __name__ == "__main__":
-        print("Launching application...")
         demo.launch()
+
 except Exception as e:
     print(f"Error during application startup: {str(e)}")
     raise
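
Note on emotion_scores: the lines that build this dictionary (old lines 117-130 / new lines 335-348) are unchanged by this commit and therefore not shown in the hunks above. For orientation only, below is a minimal, hypothetical sketch of the conventional way such per-label scores are derived from a Hugging Face sequence-classification model (softmax over the logits, label names from model.config.id2label). The helper name score_emotions and its arguments are illustrative and are not taken from app.py.

# Hedged sketch, not part of the committed file
import torch

def score_emotions(text, tokenizer, model):
    """Return {label: probability} for a short text, assuming a standard
    sequence-classification head whose labels live in model.config.id2label."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1)[0]
    return {model.config.id2label[i]: float(p) for i, p in enumerate(probs)}

With the globals set up by load_models(), such a helper would be called as emotion_scores = score_emotions(transcription, emotion_tokenizer, emotion_model) before the create_emotion_plot(emotion_scores) call shown in the diff.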