invincible-jha committed
Commit 81beee5 · verified · 1 Parent(s): 183c824

Upload app.py

Files changed (1)
  1. app.py +124 -282
app.py CHANGED
@@ -9,6 +9,8 @@ import plotly.graph_objects as go
  import warnings
  import os
  from scipy.stats import kurtosis, skew
+ from anthropic import Anthropic
+ from typing import Dict, Optional, Tuple

  # Suppress unnecessary warnings for cleaner output
  warnings.filterwarnings('ignore')
@@ -18,317 +20,139 @@ processor = None
  whisper_model = None
  emotion_tokenizer = None
  emotion_model = None
+ clinical_analyzer = None

- def load_models():
-     """Initialize and load all required machine learning models.
-
-     This function handles the loading of both the Whisper speech recognition model
-     and the emotion detection model. It includes proper error handling and
-     device management for optimal performance.
-
-     Returns:
-         bool: True if all models loaded successfully, False otherwise
-     """
-     global processor, whisper_model, emotion_tokenizer, emotion_model
-
-     try:
-         # Load the Whisper model for speech recognition
-         print("Loading Whisper model...")
-         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-         whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
-
-         # Load the emotion detection model
-         print("Loading emotion model...")
-         emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
-         emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
-
-         # Move models to CPU for consistent performance
-         device = "cpu"
-         whisper_model.to(device)
-         emotion_model.to(device)
-
-         print("Models loaded successfully!")
-         return True
-
-     except Exception as e:
-         print(f"Error loading models: {str(e)}")
-         return False
-
- def extract_prosodic_features(waveform, sr):
-     """Extract voice characteristics from audio data with enhanced error handling.
-
-     This function analyzes the audio waveform to extract various voice features
-     including pitch, energy, rhythm, and voice quality metrics. It includes
-     robust error handling and validation for each feature.
-
-     Args:
-         waveform (numpy.ndarray): Audio signal
-         sr (int): Sampling rate of the audio
-
-     Returns:
-         dict: Dictionary containing extracted features or None if extraction fails
-     """
-     try:
-         # Validate input waveform
-         if waveform is None or len(waveform) == 0:
-             print("Error: Empty or invalid waveform")
-             return None
-
-         features = {}
-
-         # Extract pitch features with enhanced reliability
-         try:
-             # Configure pitch detection parameters for optimal results
-             pitches, magnitudes = librosa.piptrack(
-                 y=waveform,
-                 sr=sr,
-                 fmin=50, # Minimum frequency for human voice
-                 fmax=2000, # Maximum frequency for human voice
-                 n_mels=128, # Frequency resolution
-                 hop_length=512, # Time resolution
-                 win_length=2048 # Analysis window size
-             )
-
-             # Extract and validate pitch contour
-             f0_contour = []
-             for t in range(pitches.shape[1]):
-                 index = magnitudes[:, t].argmax()
-                 pitch = pitches[index, t]
-                 if 50 <= pitch <= 2000: # Physiologically valid range
-                     f0_contour.append(pitch)
-
-             f0_contour = np.array(f0_contour)
-
-             # Calculate pitch statistics with validation
-             if len(f0_contour) > 0:
-                 features['pitch_mean'] = float(np.mean(f0_contour))
-                 features['pitch_std'] = float(np.std(f0_contour))
-                 features['pitch_range'] = float(np.ptp(f0_contour))
-             else:
-                 # Use default values if no valid pitch detected
-                 features['pitch_mean'] = 160.0 # Average adult speaking pitch
-                 features['pitch_std'] = 0.0
-                 features['pitch_range'] = 0.0
-
-         except Exception as e:
-             print(f"Error in pitch extraction: {str(e)}")
-             features['pitch_mean'] = 160.0
-             features['pitch_std'] = 0.0
-             features['pitch_range'] = 0.0
-
-         # Extract energy features with noise reduction
-         try:
-             rms = librosa.feature.rms(
-                 y=waveform,
-                 frame_length=2048,
-                 hop_length=512,
-                 center=True
-             )[0]
-
-             features['energy_mean'] = float(np.mean(rms))
-             features['energy_std'] = float(np.std(rms))
-             features['energy_range'] = float(np.ptp(rms))
-
-         except Exception as e:
-             print(f"Error in energy extraction: {str(e)}")
-             features['energy_mean'] = 0.02
-             features['energy_std'] = 0.0
-             features['energy_range'] = 0.0
-
-         # Extract rhythm features with improved accuracy
+ class ClinicalVoiceAnalyzer:
+     """Clinical voice analysis system using Anthropic's Claude for interpretation."""
+
+     def __init__(self):
+         """Initialize the clinical analyzer with reference ranges and API client."""
+         self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
+         self.model = "claude-3-opus-20240229"
+         self.reference_ranges = {
+             'pitch': {'min': 150, 'max': 400},
+             'tempo': {'min': 90, 'max': 130},
+             'energy': {'min': 0.01, 'max': 0.05}
+         }
+         print("Clinical analyzer initialized successfully")
+
+     def analyze_voice_metrics(self, features: Dict, emotions: Dict, transcription: str) -> str:
+         """Perform comprehensive clinical analysis of voice characteristics.
+
+         Args:
+             features: Dictionary of extracted voice features
+             emotions: Dictionary of emotion scores
+             transcription: Text transcription of the speech
+
+         Returns:
+             str: Detailed clinical analysis
+         """
          try:
-             onset_env = librosa.onset.onset_strength(
-                 y=waveform,
-                 sr=sr,
-                 hop_length=512,
-                 aggregate=np.median
+             prompt = self._create_analysis_prompt(features, emotions, transcription)
+             response = self.anthropic.messages.create(
+                 model=self.model,
+                 max_tokens=1000,
+                 messages=[{
+                     "role": "user",
+                     "content": prompt
+                 }]
              )
-
-             tempo = librosa.beat.tempo(
-                 onset_envelope=onset_env,
-                 sr=sr,
-                 hop_length=512,
-                 aggregate=None
-             )
-
-             # Validate tempo within normal speech range
-             if 40 <= tempo[0] <= 240:
-                 features['tempo'] = float(tempo[0])
-             else:
-                 features['tempo'] = 120.0 # Default speaking rate
-
+             return self._format_clinical_response(response.content)
          except Exception as e:
-             print(f"Error in rhythm extraction: {str(e)}")
-             features['tempo'] = 120.0
-
-         # Verify all required features are present
-         required_features = [
-             'pitch_mean', 'pitch_std', 'pitch_range',
-             'energy_mean', 'energy_std', 'energy_range',
-             'tempo'
-         ]
-
-         for feature in required_features:
-             if feature not in features or not isinstance(features[feature], (int, float)):
-                 print(f"Warning: Invalid or missing feature: {feature}")
-                 features[feature] = 0.0
-
-         return features
-
-     except Exception as e:
-         print(f"Critical error in extract_prosodic_features: {str(e)}")
-         return None
-
- def create_feature_plots(features):
-     """Create visualizations for the extracted voice features.
-
-     This function generates interactive plots showing the various voice
-     characteristics including pitch, energy, and rhythm features.
-
-     Args:
-         features (dict): Dictionary containing the extracted voice features
-
-     Returns:
-         str: HTML representation of the plots or None if visualization fails
-     """
-     try:
-         fig = go.Figure()
-
-         # Add pitch feature visualization
-         pitch_data = {
-             'Mean': features['pitch_mean'],
-             'Std Dev': features['pitch_std'],
-             'Range': features['pitch_range']
-         }
-
-         fig.add_trace(go.Bar(
-             name='Pitch Features (Hz)',
-             x=list(pitch_data.keys()),
-             y=list(pitch_data.values()),
-             marker_color='blue'
-         ))
-
-         # Add energy feature visualization
-         energy_data = {
-             'Mean': features['energy_mean'],
-             'Std Dev': features['energy_std'],
-             'Range': features['energy_range']
-         }
-
-         fig.add_trace(go.Bar(
-             name='Energy Features',
-             x=[f"Energy {k}" for k in energy_data.keys()],
-             y=list(energy_data.values()),
-             marker_color='red'
-         ))
-
-         # Add tempo indicator
-         fig.add_trace(go.Scatter(
-             name='Speech Rate (BPM)',
-             x=['Tempo'],
-             y=[features['tempo']],
-             mode='markers',
-             marker=dict(size=15, color='green')
-         ))
-
-         # Configure layout for better visualization
-         fig.update_layout(
-             title='Voice Feature Analysis',
-             showlegend=True,
-             height=600,
-             barmode='group',
-             xaxis_title='Feature Type',
-             yaxis_title='Value',
-             template='plotly_white'
-         )
-
-         return fig.to_html(include_plotlyjs=True)
-
-     except Exception as e:
-         print(f"Error in create_feature_plots: {str(e)}")
-         return None
-
- def create_emotion_plot(emotions):
-     """Create visualization for emotion analysis results.
-
-     Args:
-         emotions (dict): Dictionary containing emotion scores
-
-     Returns:
-         str: HTML representation of the emotion plot or None if visualization fails
-     """
-     try:
-         fig = go.Figure(data=[
-             go.Bar(
-                 x=list(emotions.keys()),
-                 y=list(emotions.values()),
-                 marker_color=['#FF9999', '#66B2FF', '#99FF99',
-                               '#FFCC99', '#FF99CC', '#99FFFF']
-             )
-         ])
-
-         fig.update_layout(
-             title='Emotion Analysis',
-             xaxis_title='Emotion',
-             yaxis_title='Confidence Score',
-             yaxis_range=[0, 1],
-             template='plotly_white',
-             height=400
-         )
-
-         return fig.to_html(include_plotlyjs=True)
-     except Exception as e:
-         print(f"Error in create_emotion_plot: {str(e)}")
-         return None
+             print(f"Error in clinical analysis: {str(e)}")
+             return self._generate_fallback_analysis(features, emotions)
+
+     def _create_analysis_prompt(self, features: Dict, emotions: Dict, transcription: str) -> str:
+         """Create a detailed prompt for clinical analysis.
+
+         Constructs a comprehensive prompt that includes all relevant voice metrics,
+         emotional patterns, and speech content for analysis.
+         """
+         return f"""As a clinical voice analysis expert specializing in mental health assessment,
+ provide a detailed psychological evaluation based on the following data:
+
+ Voice Characteristics:
+ - Pitch: {features['pitch_mean']:.2f} Hz (Normal range: {self.reference_ranges['pitch']['min']}-{self.reference_ranges['pitch']['max']} Hz)
+ - Pitch Variation: {features['pitch_std']:.2f} Hz
+ - Speech Rate: {features['tempo']:.2f} BPM (Normal range: {self.reference_ranges['tempo']['min']}-{self.reference_ranges['tempo']['max']} BPM)
+ - Voice Energy: {features['energy_mean']:.4f} (Normal range: {self.reference_ranges['energy']['min']}-{self.reference_ranges['energy']['max']})
+
+ Emotional Analysis:
+ {', '.join(f'{emotion}: {score:.1%}' for emotion, score in emotions.items())}
+
+ Speech Content:
+ "{transcription}"
+
+ Please provide a comprehensive clinical assessment including:
+ 1. Analysis of voice characteristics and their psychological significance
+ 2. Emotional state evaluation and potential underlying patterns
+ 3. Assessment of anxiety and depression indicators
+ 4. Stress level evaluation
+ 5. Clinical recommendations and considerations
+
+ Present the analysis in clear sections with specific observations and clinical insights."""
+
+     def _format_clinical_response(self, analysis: str) -> str:
+         """Format the clinical analysis for clear presentation."""
+         return f"""
+ Clinical Analysis:
+ {analysis}
+ """
+
+     def _generate_fallback_analysis(self, features: Dict, emotions: Dict) -> str:
+         """Generate basic analysis when API is unavailable."""
+         dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0]
+         pitch_status = "elevated" if features['pitch_mean'] > self.reference_ranges['pitch']['max'] else \
+                        "reduced" if features['pitch_mean'] < self.reference_ranges['pitch']['min'] else "normal"
+
+         return f"""
+ Basic Clinical Analysis (API Unavailable):
+
+ Voice Pattern Analysis:
+ - Pitch is {pitch_status} ({features['pitch_mean']:.2f} Hz)
+ - Speech rate shows {features['tempo']:.2f} BPM
+ - Voice energy indicates {features['energy_mean']:.4f} level
+
+ Emotional Indication:
+ - Primary emotional tone: {dominant_emotion}
+ - Confidence: {max(emotions.values()):.1%}
+
+ Note: This is a basic analysis. For detailed clinical interpretation, please ensure API connectivity.
+ """
+
+ # Your existing functions (load_models, extract_prosodic_features, etc.) remain the same...

  def analyze_audio(audio_input):
-     """Main function for analyzing audio input with comprehensive error handling.
-
-     This function coordinates the entire analysis pipeline including:
-     - Audio loading and validation
-     - Feature extraction
-     - Speech recognition
-     - Emotion analysis
-     - Visualization generation
-
-     Args:
-         audio_input: Audio file path or tuple containing audio data
-
-     Returns:
-         tuple: (analysis_summary, emotion_visualization, feature_visualization)
-     """
+     """Enhanced main function with clinical analysis integration."""
      try:
          if audio_input is None:
              return "Please provide an audio input", None, None

-         # Handle audio input and validate format
+         # Existing audio processing code...
          if isinstance(audio_input, tuple):
              audio_path = audio_input[0]
          else:
              audio_path = audio_input

-         # Load and validate audio
          waveform, sr = librosa.load(audio_path, sr=16000, duration=30)
          duration = len(waveform) / sr

          if duration < 0.5:
              return "Audio too short. Please provide a recording of at least 0.5 seconds.", None, None

-         # Extract voice features
          features = extract_prosodic_features(waveform, sr)
          if features is None:
              return "Error extracting voice features. Please try recording again.", None, None

-         # Create visualizations
          feature_viz = create_feature_plots(features)

-         # Perform speech recognition
+         # Speech recognition
          inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
          with torch.no_grad():
              predicted_ids = whisper_model.generate(inputs)
          transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

-         # Analyze emotions
+         # Emotion analysis
          emotion_inputs = emotion_tokenizer(
              transcription,
              return_tensors="pt",
@@ -349,7 +173,16 @@ def analyze_audio(audio_input):

          emotion_viz = create_emotion_plot(emotion_scores)

-         # Generate comprehensive analysis summary
+         # Generate clinical analysis
+         global clinical_analyzer
+         if clinical_analyzer is None:
+             clinical_analyzer = ClinicalVoiceAnalyzer()
+
+         clinical_analysis = clinical_analyzer.analyze_voice_metrics(
+             features, emotion_scores, transcription
+         )
+
+         # Create enhanced summary with clinical insights
          summary = f"""Voice Analysis Summary:

  Speech Content:
@@ -365,6 +198,8 @@ Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
  Emotion Confidence: {max(emotion_scores.values()):.2%}

  Recording Duration: {duration:.2f} seconds
+
+ {clinical_analysis}
  """

          return summary, emotion_viz, feature_viz
@@ -374,13 +209,17 @@ Recording Duration: {duration:.2f} seconds
          print(error_msg)
          return error_msg, None, None

- # Initialize the application
+ # Initialize the application with clinical analysis capability
  try:
      print("===== Application Startup =====")
      if not load_models():
          raise RuntimeError("Failed to load required models")

-     # Create Gradio interface with enhanced user guidance
+     # Initialize clinical analyzer
+     clinical_analyzer = ClinicalVoiceAnalyzer()
+     print("Clinical analyzer initialized")
+
+     # Create Gradio interface with enhanced description
      demo = gr.Interface(
          fn=analyze_audio,
          inputs=gr.Audio(
@@ -389,13 +228,13 @@ try:
              label="Audio Input (Recommended: 1-5 seconds of clear speech)"
          ),
          outputs=[
-             gr.Textbox(label="Analysis Summary", lines=10),
+             gr.Textbox(label="Analysis Summary", lines=15),
              gr.HTML(label="Emotion Analysis"),
              gr.HTML(label="Voice Feature Analysis")
          ],
-         title="Voice Analysis System",
+         title="Advanced Voice Analysis System with Clinical Interpretation",
          description="""
- This application provides detailed voice analysis through multiple components:
+ This application provides comprehensive voice analysis with clinical interpretation:

  1. Voice Features:
  - Pitch analysis (fundamental frequency and variation)
@@ -403,13 +242,16 @@
  - Speech rate (words per minute)
  - Voice quality metrics

- 2. Emotional Content:
+ 2. Clinical Analysis:
+ - Mental health indicators
+ - Emotional state evaluation
+ - Risk assessment
+ - Clinical recommendations
+
+ 3. Emotional Content:
  - Emotion detection (6 basic emotions)
  - Emotional intensity analysis

- 3. Speech Content:
- - Accurate text transcription
-
  For optimal results:
  - Record in a quiet environment
  - Speak clearly and naturally