Update app.py
app.py CHANGED
@@ -17,6 +17,8 @@ from logging.handlers import RotatingFileHandler
 import boto3
 from botocore.exceptions import NoCredentialsError
 import time
+import tempfile
+import magic
 
 # Import functions from other modules
 from asr import transcribe, ASR_LANGUAGES, ASR_SAMPLING_RATE
@@ -72,22 +74,38 @@ async def get_api_key(api_key_header: str = Security(api_key_header)):
         return api_key_header
     raise HTTPException(status_code=403, detail="Could not validate credentials")
 
-def
-
+def load_audio_file(input_bytes):
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.tmp') as temp_file:
+        temp_file.write(input_bytes)
+        temp_file_path = temp_file.name
+
     try:
-
-
-
-
-
-
+        # Log file info
+        file_info = magic.from_file(temp_file_path, mime=True)
+        logger.info(f"Received file of type: {file_info}")
+
+        # Check if the file is an audio file
+        if not file_info.startswith('audio/'):
+            raise ValueError(f"Unsupported file type: {file_info}. Only audio files are supported.")
+
+        # Try reading with soundfile (handles most audio formats)
+        try:
+            audio_array, sample_rate = sf.read(temp_file_path)
+            logger.info(f"Successfully read audio with soundfile. Shape: {audio_array.shape}, Sample rate: {sample_rate}")
+            return audio_array, sample_rate
+        except Exception as e:
+            logger.error(f"Could not read with soundfile: {str(e)}")
+            raise ValueError(f"Could not read audio file: {str(e)}") from e
+
+    finally:
+        os.unlink(temp_file_path)
 
 @app.post("/transcribe")
 async def transcribe_audio(request: AudioRequest, api_key: APIKey = Depends(get_api_key)):
     start_time = time.time()
     try:
         input_bytes = base64.b64decode(request.audio)
-        audio_array, sample_rate =
+        audio_array, sample_rate = load_audio_file(input_bytes)  # Using load_audio_file
 
         # Ensure audio_array is float32
         audio_array = audio_array.astype(np.float32)
@@ -126,7 +144,7 @@ async def transcribe_audio_file(
     start_time = time.time()
     try:
         contents = await file.read()
-        audio_array, sample_rate =
+        audio_array, sample_rate = load_audio_file(contents)  # Using load_audio_file
 
         # Ensure audio_array is float32
         audio_array = audio_array.astype(np.float32)
@@ -187,22 +205,32 @@ async def synthesize_speech(request: TTSRequest, api_key: APIKey = Depends(get_api_key)):
         sample_rate, audio = result
         logger.info(f"Synthesis result: sample_rate={sample_rate}, audio_shape={audio.shape if isinstance(audio, np.ndarray) else 'not numpy array'}, audio_dtype={audio.dtype if isinstance(audio, np.ndarray) else type(audio)}")
 
+        logger.info("Converting audio to numpy array")
         audio = np.array(audio, dtype=np.float32)
+        logger.info(f"Converted audio shape: {audio.shape}, dtype: {audio.dtype}")
+
+        logger.info("Normalizing audio")
         max_value = np.max(np.abs(audio))
         if max_value == 0:
             logger.warning("Audio array is all zeros")
             raise ValueError("Generated audio is silent (all zeros)")
         audio = audio / max_value
+        logger.info(f"Normalized audio range: [{audio.min()}, {audio.max()}]")
+
+        logger.info("Converting to int16")
         audio = (audio * 32767).astype(np.int16)
+        logger.info(f"Int16 audio shape: {audio.shape}, dtype: {audio.dtype}")
 
+        logger.info("Writing audio to buffer")
        buffer = io.BytesIO()
         sf.write(buffer, audio, sample_rate, format='wav')
         buffer.seek(0)
+        logger.info(f"Buffer size: {buffer.getbuffer().nbytes} bytes")
 
         # Generate a unique filename
         filename = f"synthesized_audio_{int(time.time())}.wav"
 
-        # Upload to S3
+        # Upload to S3 without ACL
         try:
             s3_client.upload_fileobj(
                 buffer,
@@ -211,26 +239,6 @@ async def synthesize_speech(request: TTSRequest, api_key: APIKey = Depends(get_api_key)):
                 ExtraArgs={'ContentType': 'audio/wav'}
             )
             logger.info(f"File uploaded successfully to S3: {filename}")
-
-            # Apply lifecycle policy to the uploaded file
-            s3_client.put_object_lifecycle_configuration(
-                Bucket=S3_BUCKET,
-                LifecycleConfiguration={
-                    'Rules': [
-                        {
-                            'Expiration': {
-                                'Days': 1
-                            },
-                            'Filter': {
-                                'Prefix': filename  # Apply only to this specific file
-                            },
-                            'ID': 'DeleteSynthesizedAudioAfter1Day',
-                            'Status': 'Enabled'
-                        }
-                    ]
-                }
-            )
-            logger.info("Lifecycle policy applied to delete the file after 1 day.")
 
             # Generate the public URL with the correct format
             url = f"https://s3.{S3_REGION}.amazonaws.com/{S3_BUCKET}/{filename}"
@@ -271,7 +279,7 @@ async def identify_language(request: AudioRequest, api_key: APIKey = Depends(get_api_key)):
     start_time = time.time()
     try:
         input_bytes = base64.b64decode(request.audio)
-        audio_array, sample_rate =
+        audio_array, sample_rate = load_audio_file(input_bytes)  # Using load_audio_file
         result = identify(audio_array)
         processing_time = time.time() - start_time
         return JSONResponse(content={"language_identification": result, "processing_time_seconds": processing_time})
@@ -295,7 +303,7 @@ async def identify_language_file(
     start_time = time.time()
     try:
         contents = await file.read()
-        audio_array, sample_rate =
+        audio_array, sample_rate = load_audio_file(contents)  # Using load_audio_file
         result = identify(audio_array)
         processing_time = time.time() - start_time
         return JSONResponse(content={"language_identification": result, "processing_time_seconds": processing_time})
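
For reference, here is a minimal way to exercise the updated /transcribe endpoint, which now routes the decoded bytes through load_audio_file. This is a sketch, not part of the commit: the base URL, the X-API-Key header name, and the sample file are assumptions; only the endpoint path and the base64-encoded "audio" field come from app.py, and the AudioRequest model may require additional fields not visible in this diff. Note also that the new "import magic" relies on the python-magic package, which in turn needs the system libmagic library to be installed.

# Hypothetical client call; endpoint path and the "audio" field come from app.py,
# everything else (URL, header name, sample file) is assumed.
import base64
import requests

with open("sample.wav", "rb") as f:          # any small audio file
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    "http://localhost:8000/transcribe",      # assumed host/port
    headers={"X-API-Key": "your-api-key"},   # assumed header name
    json={"audio": audio_b64},               # AudioRequest may require more fields
    timeout=120,
)
resp.raise_for_status()
print(resp.json())                           # JSON response from the server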
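The commit also drops the per-file lifecycle call after the S3 upload; boto3's S3 client has no put_object_lifecycle_configuration method, since lifecycle rules are a bucket-level setting. If the one-day expiry of synthesized files is still wanted, a bucket-level rule filtered on the "synthesized_audio_" prefix would cover every upload. The sketch below is one possible one-time setup using put_bucket_lifecycle_configuration; the bucket name and region are placeholders for the same S3_BUCKET / S3_REGION settings app.py reads, and none of this is part of the commit above.

# One-time bucket setup (not part of the commit above): expire objects whose
# keys start with "synthesized_audio_" one day after creation.
import boto3

S3_BUCKET = "your-bucket-name"   # placeholder: same bucket app.py uploads to
S3_REGION = "us-east-1"          # placeholder: same region app.py uses

s3 = boto3.client("s3", region_name=S3_REGION)
s3.put_bucket_lifecycle_configuration(
    Bucket=S3_BUCKET,
    LifecycleConfiguration={
        "Rules": [
            {
                "ID": "DeleteSynthesizedAudioAfter1Day",
                "Filter": {"Prefix": "synthesized_audio_"},
                "Expiration": {"Days": 1},
                "Status": "Enabled",
            }
        ]
    },
)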