import os
import io
import json
import time
import wave
import asyncio
import subprocess
from contextlib import closing

import boto3
import numpy as np
import openai
import gradio as gr

from langchain import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import HumanMessage

# This example uses aiofile for asynchronous file reads.
# It's not a dependency of the project but can be installed
# with `pip install aiofile`.
import aiofile
from amazon_transcribe.client import TranscribeStreamingClient
from amazon_transcribe.handlers import TranscriptResultStreamHandler
from amazon_transcribe.model import TranscriptEvent
from amazon_transcribe.utils import apply_realtime_delay
def run_shell_cmd(command):
    # Run a shell command
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    # Check the command output
    if result.returncode == 0:
        print("Command executed successfully")
        print("Command output:")
        print(result.stdout)
    else:
        print("Command failed")
        print("Error message:")
        print(result.stderr)


def wav_to_pcm(input_file, output_file):
    # Convert a WAV file to raw 16 kHz mono signed 16-bit PCM with ffmpeg,
    # the format expected by the Transcribe streaming client below.
    cmd = "ffmpeg -i " + input_file + " -f s16le -ar 16000 -ac 1 -acodec pcm_s16le " + output_file
    run_shell_cmd(cmd)
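
# Usage sketch (hypothetical paths; requires ffmpeg on PATH):
#   wav_to_pcm("/tmp/question.wav", "/tmp/question.pcm")
# Note that the command string is built by plain concatenation, so paths
# containing spaces or shell metacharacters would need quoting.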

openai.api_key = os.environ["OPENAI_API_KEY"]

polly = boto3.client('polly', region_name='us-east-1')
s3 = boto3.client('s3')
transcribe = boto3.client('transcribe')

memory = ConversationSummaryBufferMemory(llm=ChatOpenAI(), max_token_limit=2048)
conversation = ConversationChain(
    llm=OpenAI(streaming=True, callbacks=[StreamingStdOutCallbackHandler()], max_tokens=2048, temperature=0.5),
    memory=memory,
)
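
# Quick sanity check of the chain (hypothetical prompt, not part of the app
# flow): conversation.predict() returns the model's reply as a string, and
# the summary-buffer memory carries context across calls.
#   reply = conversation.predict(input="Hello, who are you?")
#   print(reply)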

SAMPLE_RATE = 16000
BYTES_PER_SAMPLE = 2
CHANNEL_NUMS = 1
AUDIO_PATH = ''
CHUNK_SIZE = 1024 * 8
REGION = "us-west-2"

transcript_text = ''
transcriptions = []


class MyEventHandler(TranscriptResultStreamHandler):
    def __init__(self, transcript_result_stream):
        super().__init__(transcript_result_stream)
        self.transcriptions = []

    async def handle_transcript_event(self, transcript_event: TranscriptEvent):
        # This handler can be implemented to handle transcriptions as needed.
        # Here's an example to get started.
        results = transcript_event.transcript.results
        for result in results:
            for alt in result.alternatives:
                print(alt.transcript)
                # Collect into both the per-handler list and the module-level
                # list that transcribe_func_new() reads from.
                self.transcriptions.append(alt.transcript)
                transcriptions.append(alt.transcript)


async def basic_transcribe():
    # Set up our client with our chosen AWS region
    client = TranscribeStreamingClient(region=REGION)

    # Start transcription to generate our async stream
    stream = await client.start_stream_transcription(
        language_code="zh-CN",
        media_sample_rate_hz=SAMPLE_RATE,
        media_encoding="pcm",
    )

    async def write_chunks():
        # NOTE: For pre-recorded files longer than 5 minutes, the sent audio
        # chunks should be rate limited to match the realtime bitrate of the
        # audio stream to avoid signing issues.
        async with aiofile.AIOFile(AUDIO_PATH, "rb") as afp:
            reader = aiofile.Reader(afp, chunk_size=CHUNK_SIZE)
            await apply_realtime_delay(
                stream, reader, BYTES_PER_SAMPLE, SAMPLE_RATE, CHANNEL_NUMS
            )
        await stream.input_stream.end_stream()

    # Instantiate our handler and start processing events
    handler = MyEventHandler(stream.output_stream)
    await asyncio.gather(write_chunks(), handler.handle_events())
    # Retrieve the transcriptions from the handler
    #transcriptions = handler.transcriptions
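
# The commented line above hints at an alternative: since MyEventHandler also
# collects results on self.transcriptions, basic_transcribe() could return
# handler.transcriptions directly instead of relying on the module-level
# `transcriptions` list (a sketch, not what the app currently does):
#   return handler.transcriptions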


def download_file(bucket_name, object_key, file_path):
    try:
        # Download the file from S3
        s3.download_file(bucket_name, object_key, file_path)
        print(f"File downloaded successfully: {file_path}")
    except Exception as e:
        print(f"Error downloading file: {str(e)}")


def play_s3_voice(text):
    response = polly.start_speech_synthesis_task(
        OutputS3BucketName='lingo-audio-materials',  # this bucket is in us-east-1
        OutputS3KeyPrefix='answers/',
        OutputFormat='mp3',
        Text=text,
        VoiceId='Zhiyu',
        LanguageCode='cmn-CN',
        Engine='neural'
    )

    # Print the task ID and status
    task_id = response['SynthesisTask']['TaskId']
    print('Task ID:', task_id)

    # Poll until the asynchronous synthesis task finishes
    while True:
        task = polly.get_speech_synthesis_task(TaskId=task_id)
        task_status = task['SynthesisTask']['TaskStatus']
        if task_status == 'completed':
            break
        elif task_status == 'failed':
            # Task failed
            print('Task failed:', task['SynthesisTask']['TaskStatusReason'])
            break
        else:
            print("Polly synthesis task is still in progress...")
            time.sleep(1)

    output_uri = response['SynthesisTask']['OutputUri']
    print("polly output_uri:" + output_uri)

    # The URI looks like https://<s3-endpoint>/<bucket>/<prefix>/<file>.mp3;
    # strip the scheme and split the rest into bucket name and key
    output_uri = output_uri.replace("https://", "")
    results = output_uri.split("/")
    bucket_name = results[1]
    key_name = results[2] + '/' + results[3]
    print("bucket name:" + bucket_name)
    print("key name:" + key_name)

    mp3_pre_signed_url = s3.generate_presigned_url(
        'get_object', Params={'Bucket': bucket_name, 'Key': key_name}, ExpiresIn=3600
    )
    print("mp3_pre_signed_url:" + mp3_pre_signed_url)

    output_file = "/tmp/from-s3.mp3"
    current_dir = os.getcwd()
    #file_absolute_path = current_dir+'/'+output_file
    print("current dir:" + current_dir)
    print("output_file_location: " + output_file)
    download_file(bucket_name, key_name, output_file)
    #encoded_path = file_absolute_path.encode("utf-8")
    #tmp_aud_file_url = output_file
    #htm_audio = f'<audio><source src={tmp_aud_file_url} type="audio/mp3" autoplay></audio>'
    #audio_htm = gr.HTML(htm_audio)
    return output_file
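
# Usage sketch (the text is illustrative): synthesize Mandarin speech with
# Polly, wait for the task, download the MP3 from S3, and get the local path:
#   mp3_path = play_s3_voice("你好，很高兴认识你。")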


def predict(input, history=[]):
    history.append(input)
    response = conversation.predict(input=input)
    print("GPT response: " + response)
    history.append(response)
    audio_file = play_s3_voice(response)
    # Pair alternating user/bot turns into (user, bot) tuples for the chatbot
    responses = [(u, b) for u, b in zip(history[::2], history[1::2])]
    print("all historical responses: " + str(responses))
    return responses, audio_file, history


def transcribe_func_new(audio):
    # basic_transcribe() reads the module-level AUDIO_PATH, so declare it
    # global here; without this, the assignment below would only create a
    # local variable and the stream would read an empty path.
    global AUDIO_PATH
    audio_file = open(audio, "rb")
    wav_file = audio_file.name
    print("wav_file: " + wav_file)
    #transcript = openai.Audio.transcribe("whisper-1", audio_file)
    #return transcript['text']
    pcm_file = os.path.splitext(wav_file)[0] + ".pcm"
    wav_to_pcm(wav_file, pcm_file)
    if os.path.exists(pcm_file):
        print("pcm file exists")
    else:
        print("pcm file does not exist")
    AUDIO_PATH = pcm_file
    # Use a fresh event loop per call: closing the loop returned by
    # asyncio.get_event_loop() would break subsequent invocations.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(basic_transcribe())
    finally:
        loop.close()
    transcript_text = transcriptions[-1]
    print("final transcribe script: " + transcript_text)
    return transcript_text


def transcribe_func_old(audio):
    audio_file = open(audio, "rb")
    file_name = audio_file.name
    #file_directory = os.path.dirname(audio_file.name)
    print("audio_file: " + file_name)
    #transcript = openai.Audio.transcribe("whisper-1", audio_file)
    #return transcript['text']

    # Set up the job parameters
    job_name = "lingo-demo"
    text_output_bucket = 'lingo-text-material'  # this bucket is in us-west-1
    text_output_key = 'transcriptions/' + job_name + '.json'
    language_code = 'zh-CN'

    # Upload the file to an S3 bucket
    audio_input_bucket_name = "lingo-audio-material"
    audio_input_s3_key = "questions/tmp-question-from-huggingface.wav"
    s3.upload_file(file_name, audio_input_bucket_name, audio_input_s3_key)

    # Construct the S3 bucket URI
    s3_uri = f"s3://{audio_input_bucket_name}/{audio_input_s3_key}"

    # Transcribe job names must be unique, so delete any previous job
    # with the same name before starting a new one
    response = transcribe.list_transcription_jobs()
    for job in response['TranscriptionJobSummaries']:
        print(job['TranscriptionJobName'])
        if job['TranscriptionJobName'] == job_name:
            response = transcribe.delete_transcription_job(TranscriptionJobName=job_name)
            print("delete transcribe job response:" + str(response))

    # Create the transcription job
    response = transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': s3_uri},
        MediaFormat='wav',
        LanguageCode=language_code,
        OutputBucketName=text_output_bucket,
        OutputKey=text_output_key
    )
    print("start transcribe job response:" + str(response))
    job_name = response["TranscriptionJob"]["TranscriptionJobName"]

    # Wait for the transcription job to complete
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=job_name)['TranscriptionJob']['TranscriptionJobStatus']
        if status in ['COMPLETED', 'FAILED']:
            break
        print("Transcription job still in progress...")
        time.sleep(1)

    # Get the transcript
    #transcript = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    transcript_uri = transcribe.get_transcription_job(TranscriptionJobName=job_name)['TranscriptionJob']['Transcript']['TranscriptFileUri']
    print("transcript uri: " + str(transcript_uri))

    transcript_file_content = s3.get_object(Bucket=text_output_bucket, Key=text_output_key)['Body'].read().decode('utf-8')
    print(transcript_file_content)
    json_data = json.loads(transcript_file_content)

    # Extract the transcript value
    transcript_text = json_data['results']['transcripts'][0]['transcript']
    return transcript_text
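
# For reference, the Transcribe output JSON parsed above has roughly this
# shape (values illustrative, not real output):
#   {
#     "jobName": "lingo-demo",
#     "results": {
#       "transcripts": [{"transcript": "..."}],
#       "items": [...]
#     },
#     "status": "COMPLETED"
#   }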


def process_audio(audio, history=[]):
    text = transcribe_func_new(audio)
    return predict(text, history)


with gr.Blocks(css="#chatbot{height:350px} .overflow-y-auto{height:500px}") as demo:
    chatbot = gr.Chatbot(elem_id="chatbot")
    state = gr.State([])
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
    with gr.Row():
        audio_input = gr.Audio(source="microphone", type="filepath", label="Audio Input")
    with gr.Row():
        audio_output = gr.Audio(type="filepath", label="Audio Output", elem_id="speaker", interactive=False)
    #audio_html = gr.HTML()
    txt.submit(predict, [txt, state], [chatbot, audio_output, state])
    audio_input.change(process_audio, [audio_input, state], [chatbot, audio_output, state])

demo.launch(debug=True)