invincible-jha
committed on
Upload 2 files
Browse files- app.py +79 -257
- requirements.txt +12 -1
app.py
CHANGED
@@ -1,239 +1,82 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import torch
|
3 |
-
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
|
4 |
-
import librosa
|
5 |
-
import numpy as np
|
6 |
-
import plotly.graph_objects as go
|
7 |
-
import warnings
|
8 |
import os
|
9 |
-
from
|
10 |
-
|
11 |
-
|
12 |
-
def extract_prosodic_features(waveform, sr):
    """Extract prosodic features from audio.

    Computes summary statistics over pitch, energy, tempo, spectral shape
    and MFCCs using librosa and returns them in a flat dictionary.

    Args:
        waveform: Audio samples passed straight to librosa
            (presumably a mono float array as produced by ``librosa.load``
            -- confirm against the caller).
        sr: Sampling rate handed to the librosa feature extractors.

    Returns:
        A dict with the keys ``pitch_mean``, ``pitch_std``, ``pitch_range``,
        ``energy_mean``, ``energy_std``, ``energy_range``, ``tempo``,
        ``spectral_centroid_mean``, ``spectral_rolloff_mean`` and, for
        ``i`` in ``0..12``, ``mfcc_{i}_mean`` / ``mfcc_{i}_std``.
        Returns ``None`` if any step raises; the error is printed.
    """
    try:
        features = {}

        # 1. Pitch (F0) Features
        pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
        # For every frame keep the pitch of the bin with the largest
        # magnitude; a single fancy-index replaces the per-frame loop.
        strongest_bins = magnitudes.argmax(axis=0)
        frame_ids = np.arange(pitches.shape[1])
        f0_contour = pitches[strongest_bins, frame_ids]
        f0_contour = f0_contour[f0_contour > 0]  # Remove zero pitches

        if len(f0_contour) > 0:
            features['pitch_mean'] = np.mean(f0_contour)
            features['pitch_std'] = np.std(f0_contour)
            features['pitch_range'] = np.ptp(f0_contour)
        else:
            # No frame produced a positive pitch estimate.
            features['pitch_mean'] = 0
            features['pitch_std'] = 0
            features['pitch_range'] = 0

        # 2. Energy/Intensity Features
        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_range'] = np.ptp(rms)

        # 3. Rhythm Features
        onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
        # NOTE(review): librosa 0.10 appears to deprecate
        # `librosa.beat.tempo` in favour of `librosa.feature.rhythm.tempo`
        # -- confirm before upgrading librosa.
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
        features['tempo'] = tempo[0]

        # 4. Voice Quality Features
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)

        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)

        # 5. MFCC Features
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        for i in range(13):
            features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
            features[f'mfcc_{i}_std'] = np.std(mfccs[i])

        return features

    except Exception as e:
        # Best-effort contract: callers check for None instead of catching.
        print(f"Error in extract_prosodic_features: {str(e)}")
        return None
|
66 |
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
'
|
77 |
-
'
|
78 |
-
|
79 |
-
|
80 |
-
fig.add_trace(go.Bar(
|
81 |
-
name='Pitch Features',
|
82 |
-
x=list(pitch_data.keys()),
|
83 |
-
y=list(pitch_data.values()),
|
84 |
-
marker_color='blue'
|
85 |
-
))
|
86 |
-
|
87 |
-
# 2. Energy Features
|
88 |
-
energy_data = {
|
89 |
-
'Mean': features['energy_mean'],
|
90 |
-
'Std Dev': features['energy_std'],
|
91 |
-
'Range': features['energy_range']
|
92 |
}
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
x=[f"Energy {k}" for k in energy_data.keys()],
|
97 |
-
y=list(energy_data.values()),
|
98 |
-
marker_color='red'
|
99 |
-
))
|
100 |
-
|
101 |
-
# 3. MFCC Plot
|
102 |
-
mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
|
103 |
-
fig.add_trace(go.Scatter(
|
104 |
-
name='MFCC Coefficients',
|
105 |
-
y=mfcc_means,
|
106 |
-
mode='lines+markers',
|
107 |
-
marker_color='green'
|
108 |
-
))
|
109 |
-
|
110 |
-
# Update layout
|
111 |
-
fig.update_layout(
|
112 |
-
title='Voice Feature Analysis',
|
113 |
-
showlegend=True,
|
114 |
-
height=600,
|
115 |
-
barmode='group'
|
116 |
-
)
|
117 |
-
|
118 |
-
return fig.to_html(include_plotlyjs=True)
|
119 |
-
|
120 |
-
except Exception as e:
|
121 |
-
print(f"Error in create_feature_plots: {str(e)}")
|
122 |
-
return None
|
123 |
-
|
124 |
-
def load_models():
|
125 |
-
"""Initialize and load all required models"""
|
126 |
-
global processor, whisper_model, emotion_tokenizer, emotion_model
|
127 |
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
print("Models loaded successfully!")
|
141 |
-
return True
|
142 |
-
except Exception as e:
|
143 |
-
print(f"Error loading models: {str(e)}")
|
144 |
-
return False
|
145 |
|
146 |
-
def
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
)
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
-
|
158 |
-
title='Emotion Analysis',
|
159 |
-
xaxis_title='Emotion',
|
160 |
-
yaxis_title='Score',
|
161 |
-
yaxis_range=[0, 1],
|
162 |
-
template='plotly_white',
|
163 |
-
height=400
|
164 |
-
)
|
165 |
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
|
|
170 |
|
|
|
|
|
|
|
171 |
def analyze_audio(audio_input):
|
172 |
-
"""Main function to analyze audio input"""
|
173 |
try:
|
174 |
-
|
175 |
-
return "Please provide an audio input", None, None
|
176 |
-
|
177 |
-
print(f"Processing audio input: {type(audio_input)}")
|
178 |
-
|
179 |
-
# Handle audio input
|
180 |
-
if isinstance(audio_input, tuple):
|
181 |
-
audio_path = audio_input[0] # Get file path from tuple
|
182 |
-
else:
|
183 |
-
audio_path = audio_input
|
184 |
-
|
185 |
-
print(f"Loading audio from path: {audio_path}")
|
186 |
|
187 |
-
#
|
188 |
-
|
189 |
-
|
190 |
|
191 |
-
#
|
192 |
-
print("Extracting voice features...")
|
193 |
-
features = extract_prosodic_features(waveform, sr)
|
194 |
-
if features is None:
|
195 |
-
return "Error extracting voice features", None, None
|
196 |
-
|
197 |
-
# Create feature plots
|
198 |
-
print("Creating feature visualizations...")
|
199 |
-
feature_viz = create_feature_plots(features)
|
200 |
-
if feature_viz is None:
|
201 |
-
return "Error creating feature visualizations", None, None
|
202 |
-
|
203 |
-
# Transcribe audio
|
204 |
-
print("Transcribing audio...")
|
205 |
-
inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
|
206 |
-
|
207 |
-
with torch.no_grad():
|
208 |
-
predicted_ids = whisper_model.generate(inputs)
|
209 |
-
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
210 |
-
|
211 |
-
# Analyze emotions
|
212 |
-
print("Analyzing emotions...")
|
213 |
-
emotion_inputs = emotion_tokenizer(
|
214 |
-
transcription,
|
215 |
-
return_tensors="pt",
|
216 |
-
padding=True,
|
217 |
-
truncation=True,
|
218 |
-
max_length=512
|
219 |
-
)
|
220 |
-
|
221 |
-
with torch.no_grad():
|
222 |
-
emotion_outputs = emotion_model(**emotion_inputs)
|
223 |
-
emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)
|
224 |
-
|
225 |
-
emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
|
226 |
-
emotion_scores = {
|
227 |
-
label: float(score)
|
228 |
-
for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
|
229 |
-
}
|
230 |
-
|
231 |
-
# Create emotion visualization
|
232 |
-
emotion_viz = create_emotion_plot(emotion_scores)
|
233 |
-
if emotion_viz is None:
|
234 |
-
return "Error creating emotion visualization", None, None
|
235 |
-
|
236 |
-
# Create analysis summary
|
237 |
summary = f"""Voice Analysis Summary:
|
238 |
|
239 |
Speech Content:
|
@@ -246,21 +89,20 @@ Voice Characteristics:
|
|
246 |
- Voice Energy: {features['energy_mean']:.4f}
|
247 |
|
248 |
Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
|
|
|
|
|
|
|
249 |
"""
|
250 |
-
|
251 |
-
return summary, emotion_viz, feature_viz
|
252 |
|
253 |
except Exception as e:
|
254 |
error_msg = f"Error in audio analysis: {str(e)}"
|
255 |
print(error_msg)
|
256 |
-
return error_msg, None, None
|
257 |
|
258 |
-
#
|
259 |
-
print("Initializing application...")
|
260 |
-
if not load_models():
|
261 |
-
raise RuntimeError("Failed to load required models")
|
262 |
|
263 |
-
#
|
264 |
demo = gr.Interface(
|
265 |
fn=analyze_audio,
|
266 |
inputs=gr.Audio(
|
@@ -269,32 +111,12 @@ demo = gr.Interface(
|
|
269 |
label="Audio Input"
|
270 |
),
|
271 |
outputs=[
|
272 |
-
gr.Textbox(label="Analysis Summary", lines=
|
273 |
gr.HTML(label="Emotion Analysis"),
|
274 |
-
gr.HTML(label="Voice Feature Analysis")
|
|
|
275 |
],
|
276 |
-
title="Voice Analysis System",
|
277 |
-
description="""
|
278 |
-
|
279 |
-
|
280 |
-
1. Voice Features:
|
281 |
-
- Pitch analysis
|
282 |
-
- Energy patterns
|
283 |
-
- Speech rate
|
284 |
-
- Voice quality
|
285 |
-
|
286 |
-
2. Emotional Content:
|
287 |
-
- Emotion detection
|
288 |
-
- Emotional intensity
|
289 |
-
|
290 |
-
3. Speech Content:
|
291 |
-
- Text transcription
|
292 |
-
|
293 |
-
Upload an audio file or record directly through your microphone.
|
294 |
-
""",
|
295 |
-
examples=None,
|
296 |
-
cache_examples=False
|
297 |
-
)
|
298 |
-
|
299 |
-
if __name__ == "__main__":
|
300 |
-
demo.launch(share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
from anthropic import Anthropic
|
3 |
+
import gradio as gr
|
4 |
+
# ... (your existing imports)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
+
class ClinicalVoiceAnalyzer:
    """Turn extracted voice features into a textual analysis.

    When an Anthropic client could be created (``ANTHROPIC_API_KEY`` is set
    and client construction succeeds) the analysis is requested from the
    configured model; otherwise, or when the request fails, a basic
    locally generated summary is returned instead.
    """

    def __init__(self):
        # Initialize without the API key first
        self.anthropic = None
        self.model = "claude-3-opus-20240229"
        self.api_key = os.getenv('ANTHROPIC_API_KEY')

        # Reference ranges remain the same
        self.reference_ranges = {
            'pitch': {'min': 150, 'max': 400},
            'tempo': {'min': 90, 'max': 130},
            'energy': {'min': 0.01, 'max': 0.05}
        }

        # Initialize Anthropic client if API key is available
        self._initialize_anthropic()

    def _initialize_anthropic(self):
        """Safely initialize the Anthropic client.

        Leaves ``self.anthropic`` as ``None`` when no API key is configured
        or when constructing the client raises.
        """
        try:
            if self.api_key:
                self.anthropic = Anthropic(api_key=self.api_key)
                print("Anthropic client initialized successfully")
            else:
                print("Warning: ANTHROPIC_API_KEY not found in environment variables")
        except Exception as e:
            print(f"Error initializing Anthropic client: {str(e)}")
            self.anthropic = None

    @staticmethod
    def _response_text(content):
        """Flatten a Messages API ``content`` payload into plain text.

        The Messages API returns a list of content blocks rather than a
        string; concatenate the ``text`` of every block that has one.
        A plain string is passed through unchanged.
        """
        if isinstance(content, str):
            return content
        return "".join(getattr(block, "text", "") or "" for block in content)

    def generate_clinical_analysis(self, voice_features):
        """Generate clinical analysis with fallback behavior.

        Args:
            voice_features: Mapping of feature names to values; the fallback
                path reads ``pitch_mean``, ``tempo`` and ``energy_mean``.

        Returns:
            A ``(analysis_text, insights)`` tuple. ``insights`` is whatever
            ``_parse_clinical_response`` returns for the model's text, or an
            empty dict when the fallback analysis is used.
        """
        if not self.anthropic:
            return self._generate_fallback_analysis(voice_features), {}

        try:
            prompt = self._construct_analysis_prompt(voice_features)
            response = self.anthropic.messages.create(
                model=self.model,
                max_tokens=1000,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            # response.content is a list of content blocks, not a string;
            # extract the text so it can be embedded in the summary.
            analysis_text = self._response_text(response.content)
            return analysis_text, self._parse_clinical_response(analysis_text)
        except Exception as e:
            # Any API or parsing failure degrades to the local summary.
            print(f"Error in clinical analysis: {str(e)}")
            return self._generate_fallback_analysis(voice_features), {}

    def _generate_fallback_analysis(self, features):
        """Generate basic analysis when Anthropic API is unavailable.

        Labels pitch and tempo as "elevated" when they exceed the upper
        bound of the matching reference range, "normal" otherwise.
        """
        pitch_status = "elevated" if features['pitch_mean'] > self.reference_ranges['pitch']['max'] else "normal"
        tempo_status = "elevated" if features['tempo'] > self.reference_ranges['tempo']['max'] else "normal"

        return f"""Basic Voice Analysis:

Pitch Analysis: {pitch_status} ({features['pitch_mean']:.2f} Hz)
Speech Rate: {tempo_status} ({features['tempo']:.2f} BPM)
Energy Level: {features['energy_mean']:.4f}

Note: This is a basic analysis. For detailed clinical interpretation, please ensure the Anthropic API key is configured."""

    # ... (rest of your ClinicalVoiceAnalyzer methods remain the same)
|
69 |
+
|
70 |
+
# Modified analyze_audio function
|
71 |
def analyze_audio(audio_input):
|
|
|
72 |
try:
|
73 |
+
# Your existing audio processing code...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
+
# Initialize clinical analyzer with graceful fallback
|
76 |
+
clinical_analyzer = ClinicalVoiceAnalyzer()
|
77 |
+
clinical_analysis, clinical_insights = clinical_analyzer.generate_clinical_analysis(features)
|
78 |
|
79 |
+
# Create enhanced summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
summary = f"""Voice Analysis Summary:
|
81 |
|
82 |
Speech Content:
|
|
|
89 |
- Voice Energy: {features['energy_mean']:.4f}
|
90 |
|
91 |
Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
|
92 |
+
|
93 |
+
Clinical Analysis:
|
94 |
+
{clinical_analysis}
|
95 |
"""
|
96 |
+
return summary, emotion_viz, feature_viz, clinical_insights
|
|
|
97 |
|
98 |
except Exception as e:
|
99 |
error_msg = f"Error in audio analysis: {str(e)}"
|
100 |
print(error_msg)
|
101 |
+
return error_msg, None, None, None
|
102 |
|
103 |
+
# ... (rest of your existing code)
|
|
|
|
|
|
|
104 |
|
105 |
+
# Modified Gradio interface
|
106 |
demo = gr.Interface(
|
107 |
fn=analyze_audio,
|
108 |
inputs=gr.Audio(
|
|
|
111 |
label="Audio Input"
|
112 |
),
|
113 |
outputs=[
|
114 |
+
gr.Textbox(label="Analysis Summary", lines=15),
|
115 |
gr.HTML(label="Emotion Analysis"),
|
116 |
+
gr.HTML(label="Voice Feature Analysis"),
|
117 |
+
gr.JSON(label="Clinical Insights")
|
118 |
],
|
119 |
+
title="Advanced Voice Analysis System",
|
120 |
+
description="""This system provides comprehensive voice analysis with clinical interpretation.
|
121 |
+
Upload an audio file or record directly through your microphone."""
|
122 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
gradio==3.50.2
|
2 |
torch==2.1.0
|
3 |
transformers==4.35.2
|
@@ -5,4 +6,14 @@ librosa==0.10.1
|
|
5 |
numpy==1.24.3
|
6 |
plotly==5.18.0
|
7 |
soundfile==0.12.1
|
8 |
-
scipy==1.11.3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Core dependencies with existing versions
|
2 |
gradio==3.50.2
|
3 |
torch==2.1.0
|
4 |
transformers==4.35.2
|
|
|
6 |
numpy==1.24.3
|
7 |
plotly==5.18.0
|
8 |
soundfile==0.12.1
|
9 |
+
scipy==1.11.3
|
10 |
+
|
11 |
+
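# New dependencies for Anthropic integration
# NOTE(review): app.py calls `client.messages.create`; the 0.3.x SDK pinned below appears to predate the Messages API -- confirm and raise the pin if so.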
|
12 |
+
anthropic==0.3.11
|
13 |
+
python-dotenv==1.0.0
|
14 |
+
requests>=2.31.0
|
15 |
+
|
16 |
+
# Additional utilities that enhance stability
|
17 |
+
tqdm>=4.66.1
|
18 |
+
regex>=2023.8.8
|
19 |
+
tenacity>=8.2.3
|