lewistape committed on
Commit
5154443
·
verified ·
1 Parent(s): 3c8e303

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -33
app.py CHANGED
@@ -17,6 +17,8 @@ from logging.handlers import RotatingFileHandler
17
  import boto3
18
  from botocore.exceptions import NoCredentialsError
19
  import time
 
 
20
 
21
  # Import functions from other modules
22
  from asr import transcribe, ASR_LANGUAGES, ASR_SAMPLING_RATE
@@ -72,22 +74,38 @@ async def get_api_key(api_key_header: str = Security(api_key_header)):
72
  return api_key_header
73
  raise HTTPException(status_code=403, detail="Could not validate credentials")
74
 
75
def load_audio_from_bytes(input_bytes):
    """Decode an in-memory audio payload with soundfile.

    Args:
        input_bytes: Raw bytes of an encoded audio file.

    Returns:
        The ``(audio_array, sample_rate)`` pair produced by ``sf.read``.

    Raises:
        ValueError: If the payload cannot be decoded for any reason.
    """
    try:
        # soundfile accepts a file-like object, so no temp file is needed.
        stream = io.BytesIO(input_bytes)
        decoded = sf.read(stream)
        samples, rate = decoded
        logger.info(f"Successfully read audio with soundfile. Shape: {samples.shape}, Sample rate: {rate}")
        return samples, rate
    except Exception as exc:
        logger.error(f"Could not read audio with soundfile: {str(exc)}")
        raise ValueError(f"Unsupported audio format or corrupted file: {str(exc)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  @app.post("/transcribe")
86
  async def transcribe_audio(request: AudioRequest, api_key: APIKey = Depends(get_api_key)):
87
  start_time = time.time()
88
  try:
89
  input_bytes = base64.b64decode(request.audio)
90
- audio_array, sample_rate = load_audio_from_bytes(input_bytes) # Directly load audio
91
 
92
  # Ensure audio_array is float32
93
  audio_array = audio_array.astype(np.float32)
@@ -126,7 +144,7 @@ async def transcribe_audio_file(
126
  start_time = time.time()
127
  try:
128
  contents = await file.read()
129
- audio_array, sample_rate = load_audio_from_bytes(contents) # Directly load audio
130
 
131
  # Ensure audio_array is float32
132
  audio_array = audio_array.astype(np.float32)
@@ -187,22 +205,32 @@ async def synthesize_speech(request: TTSRequest, api_key: APIKey = Depends(get_a
187
  sample_rate, audio = result
188
  logger.info(f"Synthesis result: sample_rate={sample_rate}, audio_shape={audio.shape if isinstance(audio, np.ndarray) else 'not numpy array'}, audio_dtype={audio.dtype if isinstance(audio, np.ndarray) else type(audio)}")
189
 
 
190
  audio = np.array(audio, dtype=np.float32)
 
 
 
191
  max_value = np.max(np.abs(audio))
192
  if max_value == 0:
193
  logger.warning("Audio array is all zeros")
194
  raise ValueError("Generated audio is silent (all zeros)")
195
  audio = audio / max_value
 
 
 
196
  audio = (audio * 32767).astype(np.int16)
 
197
 
 
198
  buffer = io.BytesIO()
199
  sf.write(buffer, audio, sample_rate, format='wav')
200
  buffer.seek(0)
 
201
 
202
  # Generate a unique filename
203
  filename = f"synthesized_audio_{int(time.time())}.wav"
204
 
205
- # Upload to S3 with lifecycle policy
206
  try:
207
  s3_client.upload_fileobj(
208
  buffer,
@@ -211,26 +239,6 @@ async def synthesize_speech(request: TTSRequest, api_key: APIKey = Depends(get_a
211
  ExtraArgs={'ContentType': 'audio/wav'}
212
  )
213
  logger.info(f"File uploaded successfully to S3: {filename}")
214
-
215
- # Apply lifecycle policy to the uploaded file
216
- s3_client.put_object_lifecycle_configuration(
217
- Bucket=S3_BUCKET,
218
- LifecycleConfiguration={
219
- 'Rules': [
220
- {
221
- 'Expiration': {
222
- 'Days': 1
223
- },
224
- 'Filter': {
225
- 'Prefix': filename # Apply only to this specific file
226
- },
227
- 'ID': 'DeleteSynthesizedAudioAfter1Day',
228
- 'Status': 'Enabled'
229
- }
230
- ]
231
- }
232
- )
233
- logger.info("Lifecycle policy applied to delete the file after 1 day.")
234
 
235
  # Generate the public URL with the correct format
236
  url = f"https://s3.{S3_REGION}.amazonaws.com/{S3_BUCKET}/{filename}"
@@ -271,7 +279,7 @@ async def identify_language(request: AudioRequest, api_key: APIKey = Depends(get
271
  start_time = time.time()
272
  try:
273
  input_bytes = base64.b64decode(request.audio)
274
- audio_array, sample_rate = load_audio_from_bytes(input_bytes) # Directly load audio
275
  result = identify(audio_array)
276
  processing_time = time.time() - start_time
277
  return JSONResponse(content={"language_identification": result, "processing_time_seconds": processing_time})
@@ -295,7 +303,7 @@ async def identify_language_file(
295
  start_time = time.time()
296
  try:
297
  contents = await file.read()
298
- audio_array, sample_rate = load_audio_from_bytes(contents) # Directly load audio
299
  result = identify(audio_array)
300
  processing_time = time.time() - start_time
301
  return JSONResponse(content={"language_identification": result, "processing_time_seconds": processing_time})
 
17
  import boto3
18
  from botocore.exceptions import NoCredentialsError
19
  import time
20
+ import tempfile
21
+ import magic
22
 
23
  # Import functions from other modules
24
  from asr import transcribe, ASR_LANGUAGES, ASR_SAMPLING_RATE
 
74
  return api_key_header
75
  raise HTTPException(status_code=403, detail="Could not validate credentials")
76
 
77
def load_audio_file(input_bytes):
    """Decode an uploaded audio payload into samples and a sample rate.

    The bytes are spooled to a temporary file so that libmagic can sniff the
    MIME type and soundfile can open the data by path. The temporary file is
    always removed before returning, including when writing it fails.

    Args:
        input_bytes: Raw bytes of the uploaded/encoded audio file.

    Returns:
        The ``(audio_array, sample_rate)`` pair produced by ``sf.read``.

    Raises:
        ValueError: If the detected MIME type is not ``audio/*`` or if
            soundfile cannot decode the file.
    """
    # delete=False: the file is reopened by path below, so we own its removal.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.tmp')
    temp_file_path = temp_file.name

    try:
        # Write inside the cleanup scope so a failed write (e.g. disk full)
        # cannot leak the temp file; leaving the `with` flushes and closes it
        # before other libraries reopen it by path.
        with temp_file:
            temp_file.write(input_bytes)

        # Log file info
        file_info = magic.from_file(temp_file_path, mime=True)
        logger.info(f"Received file of type: {file_info}")

        # Check if the file is an audio file
        if not file_info.startswith('audio/'):
            raise ValueError(f"Unsupported file type: {file_info}. Only audio files are supported.")

        # Try reading with soundfile (handles most audio formats)
        try:
            audio_array, sample_rate = sf.read(temp_file_path)
        except Exception as e:
            logger.error(f"Could not read with soundfile: {str(e)}")
            raise ValueError(f"Could not read audio file: {str(e)}") from e

        logger.info(f"Successfully read audio with soundfile. Shape: {audio_array.shape}, Sample rate: {sample_rate}")
        return audio_array, sample_rate
    finally:
        os.unlink(temp_file_path)
102
 
103
  @app.post("/transcribe")
104
  async def transcribe_audio(request: AudioRequest, api_key: APIKey = Depends(get_api_key)):
105
  start_time = time.time()
106
  try:
107
  input_bytes = base64.b64decode(request.audio)
108
+ audio_array, sample_rate = load_audio_file(input_bytes) # Using load_audio_file
109
 
110
  # Ensure audio_array is float32
111
  audio_array = audio_array.astype(np.float32)
 
144
  start_time = time.time()
145
  try:
146
  contents = await file.read()
147
+ audio_array, sample_rate = load_audio_file(contents) # Using load_audio_file
148
 
149
  # Ensure audio_array is float32
150
  audio_array = audio_array.astype(np.float32)
 
205
  sample_rate, audio = result
206
  logger.info(f"Synthesis result: sample_rate={sample_rate}, audio_shape={audio.shape if isinstance(audio, np.ndarray) else 'not numpy array'}, audio_dtype={audio.dtype if isinstance(audio, np.ndarray) else type(audio)}")
207
 
208
+ logger.info("Converting audio to numpy array")
209
  audio = np.array(audio, dtype=np.float32)
210
+ logger.info(f"Converted audio shape: {audio.shape}, dtype: {audio.dtype}")
211
+
212
+ logger.info("Normalizing audio")
213
  max_value = np.max(np.abs(audio))
214
  if max_value == 0:
215
  logger.warning("Audio array is all zeros")
216
  raise ValueError("Generated audio is silent (all zeros)")
217
  audio = audio / max_value
218
+ logger.info(f"Normalized audio range: [{audio.min()}, {audio.max()}]")
219
+
220
+ logger.info("Converting to int16")
221
  audio = (audio * 32767).astype(np.int16)
222
+ logger.info(f"Int16 audio shape: {audio.shape}, dtype: {audio.dtype}")
223
 
224
+ logger.info("Writing audio to buffer")
225
  buffer = io.BytesIO()
226
  sf.write(buffer, audio, sample_rate, format='wav')
227
  buffer.seek(0)
228
+ logger.info(f"Buffer size: {buffer.getbuffer().nbytes} bytes")
229
 
230
  # Generate a unique filename
231
  filename = f"synthesized_audio_{int(time.time())}.wav"
232
 
233
+ # Upload to S3 without ACL
234
  try:
235
  s3_client.upload_fileobj(
236
  buffer,
 
239
  ExtraArgs={'ContentType': 'audio/wav'}
240
  )
241
  logger.info(f"File uploaded successfully to S3: {filename}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  # Generate the public URL with the correct format
244
  url = f"https://s3.{S3_REGION}.amazonaws.com/{S3_BUCKET}/{filename}"
 
279
  start_time = time.time()
280
  try:
281
  input_bytes = base64.b64decode(request.audio)
282
+ audio_array, sample_rate = load_audio_file(input_bytes) # Using load_audio_file
283
  result = identify(audio_array)
284
  processing_time = time.time() - start_time
285
  return JSONResponse(content={"language_identification": result, "processing_time_seconds": processing_time})
 
303
  start_time = time.time()
304
  try:
305
  contents = await file.read()
306
+ audio_array, sample_rate = load_audio_file(contents) # Using load_audio_file
307
  result = identify(audio_array)
308
  processing_time = time.time() - start_time
309
  return JSONResponse(content={"language_identification": result, "processing_time_seconds": processing_time})