Update app.py
app.py CHANGED
@@ -17,6 +17,8 @@ from logging.handlers import RotatingFileHandler
 import boto3
 from botocore.exceptions import NoCredentialsError
 import time
+import tempfile
+import magic
 
 # Import functions from other modules
 from asr import transcribe, ASR_LANGUAGES, ASR_SAMPLING_RATE
@@ -72,22 +74,38 @@ async def get_api_key(api_key_header: str = Security(api_key_header)):
         return api_key_header
     raise HTTPException(status_code=403, detail="Could not validate credentials")
 
-def
-
+def load_audio_file(input_bytes):
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.tmp') as temp_file:
+        temp_file.write(input_bytes)
+        temp_file_path = temp_file.name
+
     try:
-
-
-
-
-
-
+        # Log file info
+        file_info = magic.from_file(temp_file_path, mime=True)
+        logger.info(f"Received file of type: {file_info}")
+
+        # Check if the file is an audio file
+        if not file_info.startswith('audio/'):
+            raise ValueError(f"Unsupported file type: {file_info}. Only audio files are supported.")
+
+        # Try reading with soundfile (handles most audio formats)
+        try:
+            audio_array, sample_rate = sf.read(temp_file_path)
+            logger.info(f"Successfully read audio with soundfile. Shape: {audio_array.shape}, Sample rate: {sample_rate}")
+            return audio_array, sample_rate
+        except Exception as e:
+            logger.error(f"Could not read with soundfile: {str(e)}")
+            raise ValueError(f"Could not read audio file: {str(e)}") from e
+
+    finally:
+        os.unlink(temp_file_path)
 
 @app.post("/transcribe")
 async def transcribe_audio(request: AudioRequest, api_key: APIKey = Depends(get_api_key)):
     start_time = time.time()
     try:
         input_bytes = base64.b64decode(request.audio)
-        audio_array, sample_rate =
+        audio_array, sample_rate = load_audio_file(input_bytes)  # Using load_audio_file
 
         # Ensure audio_array is float32
         audio_array = audio_array.astype(np.float32)
@@ -126,7 +144,7 @@ async def transcribe_audio_file(
     start_time = time.time()
     try:
         contents = await file.read()
-        audio_array, sample_rate =
+        audio_array, sample_rate = load_audio_file(contents)  # Using load_audio_file
 
         # Ensure audio_array is float32
         audio_array = audio_array.astype(np.float32)
@@ -187,22 +205,32 @@ async def synthesize_speech(request: TTSRequest, api_key: APIKey = Depends(get_api_key)):
         sample_rate, audio = result
         logger.info(f"Synthesis result: sample_rate={sample_rate}, audio_shape={audio.shape if isinstance(audio, np.ndarray) else 'not numpy array'}, audio_dtype={audio.dtype if isinstance(audio, np.ndarray) else type(audio)}")
 
+        logger.info("Converting audio to numpy array")
         audio = np.array(audio, dtype=np.float32)
+        logger.info(f"Converted audio shape: {audio.shape}, dtype: {audio.dtype}")
+
+        logger.info("Normalizing audio")
         max_value = np.max(np.abs(audio))
         if max_value == 0:
             logger.warning("Audio array is all zeros")
             raise ValueError("Generated audio is silent (all zeros)")
         audio = audio / max_value
+        logger.info(f"Normalized audio range: [{audio.min()}, {audio.max()}]")
+
+        logger.info("Converting to int16")
         audio = (audio * 32767).astype(np.int16)
+        logger.info(f"Int16 audio shape: {audio.shape}, dtype: {audio.dtype}")
 
+        logger.info("Writing audio to buffer")
        buffer = io.BytesIO()
         sf.write(buffer, audio, sample_rate, format='wav')
         buffer.seek(0)
+        logger.info(f"Buffer size: {buffer.getbuffer().nbytes} bytes")
 
         # Generate a unique filename
         filename = f"synthesized_audio_{int(time.time())}.wav"
 
-        # Upload to S3
+        # Upload to S3 without ACL
         try:
             s3_client.upload_fileobj(
                 buffer,
@@ -211,26 +239,6 @@ async def synthesize_speech(request: TTSRequest, api_key: APIKey = Depends(get_api_key)):
                 ExtraArgs={'ContentType': 'audio/wav'}
             )
             logger.info(f"File uploaded successfully to S3: {filename}")
-
-            # Apply lifecycle policy to the uploaded file
-            s3_client.put_object_lifecycle_configuration(
-                Bucket=S3_BUCKET,
-                LifecycleConfiguration={
-                    'Rules': [
-                        {
-                            'Expiration': {
-                                'Days': 1
-                            },
-                            'Filter': {
-                                'Prefix': filename  # Apply only to this specific file
-                            },
-                            'ID': 'DeleteSynthesizedAudioAfter1Day',
-                            'Status': 'Enabled'
-                        }
-                    ]
-                }
-            )
-            logger.info("Lifecycle policy applied to delete the file after 1 day.")
 
             # Generate the public URL with the correct format
             url = f"https://s3.{S3_REGION}.amazonaws.com/{S3_BUCKET}/{filename}"
@@ -271,7 +279,7 @@ async def identify_language(request: AudioRequest, api_key: APIKey = Depends(get_api_key)):
     start_time = time.time()
     try:
         input_bytes = base64.b64decode(request.audio)
-        audio_array, sample_rate =
+        audio_array, sample_rate = load_audio_file(input_bytes)  # Using load_audio_file
         result = identify(audio_array)
         processing_time = time.time() - start_time
         return JSONResponse(content={"language_identification": result, "processing_time_seconds": processing_time})
@@ -295,7 +303,7 @@ async def identify_language_file(
     start_time = time.time()
     try:
         contents = await file.read()
-        audio_array, sample_rate =
+        audio_array, sample_rate = load_audio_file(contents)  # Using load_audio_file
         result = identify(audio_array)
         processing_time = time.time() - start_time
         return JSONResponse(content={"language_identification": result, "processing_time_seconds": processing_time})
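
For reference, here is a minimal way to exercise the updated /transcribe endpoint, which now routes the decoded bytes through load_audio_file. This is a sketch, not part of the commit: the base URL, the X-API-Key header name, and the sample file are assumptions; only the endpoint path and the base64-encoded "audio" field come from app.py, and the AudioRequest model may require additional fields not visible in this diff. Note also that the new "import magic" relies on the python-magic package, which in turn needs the system libmagic library to be installed.

# Hypothetical client call; endpoint path and the "audio" field come from app.py,
# everything else (URL, header name, sample file) is assumed.
import base64
import requests

with open("sample.wav", "rb") as f:          # any small audio file
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    "http://localhost:8000/transcribe",      # assumed host/port
    headers={"X-API-Key": "your-api-key"},   # assumed header name
    json={"audio": audio_b64},               # AudioRequest may require more fields
    timeout=120,
)
resp.raise_for_status()
print(resp.json())                           # JSON response from the server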
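The commit also drops the per-file lifecycle call after the S3 upload; boto3's S3 client has no put_object_lifecycle_configuration method, since lifecycle rules are a bucket-level setting. If the one-day expiry of synthesized files is still wanted, a bucket-level rule filtered on the "synthesized_audio_" prefix would cover every upload. The sketch below is one possible one-time setup using put_bucket_lifecycle_configuration; the bucket name and region are placeholders for the same S3_BUCKET / S3_REGION settings app.py reads, and none of this is part of the commit above.

# One-time bucket setup (not part of the commit above): expire objects whose
# keys start with "synthesized_audio_" one day after creation.
import boto3

S3_BUCKET = "your-bucket-name"   # placeholder: same bucket app.py uploads to
S3_REGION = "us-east-1"          # placeholder: same region app.py uses

s3 = boto3.client("s3", region_name=S3_REGION)
s3.put_bucket_lifecycle_configuration(
    Bucket=S3_BUCKET,
    LifecycleConfiguration={
        "Rules": [
            {
                "ID": "DeleteSynthesizedAudioAfter1Day",
                "Filter": {"Prefix": "synthesized_audio_"},
                "Expiration": {"Days": 1},
                "Status": "Enabled",
            }
        ]
    },
)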