Save transcription or translation to txt file
How do I save the results of the ASR transcription and translation to a txt file, i.e. from this code:
from transformers import pipeline
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large")
asr("audio_king.mp3", generate_kwargs={'task': 'transcribe', 'language': 'no'})
asr("audio_king.mp3", generate_kwargs={'task': 'translate', 'language': 'no'})
Thanks.
transcription_result = asr(input_file, generate_kwargs={'task': 'transcribe', 'language': 'no'})
output_file = "my_transcriptions.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(transcription_result['text'])
This works for those who may need it.
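For context, the pipeline call returns a plain dict, which is why indexing with ['text'] works; roughly:

result = asr(input_file, generate_kwargs={'task': 'transcribe', 'language': 'no'})
# result is a dict like {'text': '...the transcription...'}
# (it also contains a 'chunks' list when return_timestamps=True is passed)
print(result['text'])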
import sys
sys.stdout.write("Imports ...\n")
sys.stdout.flush()
from transformers import pipeline
# Load the model
sys.stdout.write("Loading the model ...\n")
sys.stdout.flush()
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large")
#transcribe
sys.stdout.write("Transcribing ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", generate_kwargs={'task': 'transcribe', 'language': 'no'})
with open('output_transcribe.txt', 'w+') as fh:
    fh.write(text['text'])
#translate
sys.stdout.write("Translating ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", generate_kwargs={'task': 'translate', 'language': 'en'})
with open('output_translate.txt', 'w+') as fh:
    fh.write(text['text'])
sys.stdout.write("Done\n")
sys.stdout.flush()
transcription_result = asr(input_file, generate_kwargs={'task': 'transcribe', 'language': 'no'})
output_file = "my_transcriptions.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(transcription_result['text'])
Thanks, I found the same answer :)
Also how do we tell it to use the GPU?
The above code seems to be CPU only, and appending .to("cuda") to the pipeline does not work in this case.
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large", device="cuda") seems to work
OK, the final code to handle longer mp3 files and use the GPU is...
import sys
sys.stdout.write("Imports ...\n")
sys.stdout.flush()
from transformers import pipeline
# Load the model
sys.stdout.write("Loading the model ...\n")
sys.stdout.flush()
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large", device="cuda")
#transcribe
sys.stdout.write("Transcribing ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", chunk_length_s=28, return_timestamps=True, generate_kwargs={'num_beams': 5, 'task': 'transcribe', 'language': 'no'})
with open('output_transcribe.txt', 'w+') as fh:
    fh.write(text['text'])
#translate
sys.stdout.write("Translating ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", chunk_length_s=28, return_timestamps=True, generate_kwargs={'num_beams': 5, 'task': 'translate', 'language': 'en'})
with open('output_translate.txt', 'w+') as fh:
    fh.write(text['text'])
sys.stdout.write("Done\n")
sys.stdout.flush()
Really nice - I just need to figure out how to get timestamps into the fh.write(text['text']) output, as return_timestamps=True doesn't change that text by default and the whole translation ends up on a single line.
# Writing output with timestamps
with open('output_transcribe02.txt', 'w', encoding='utf-8') as fh:
    for chunk in text['chunks']:  # 'chunks' is present when return_timestamps=True
        start_time, end_time = chunk['timestamp']
        transcribed_text = chunk['text']
        fh.write(f"{start_time}-{end_time}: {transcribed_text}\n")
Solved
For completeness, here is the final script and the environment setup commands used, for anyone else who wants a more complete example to try locally.
import sys
sys.stdout.write("Imports ...\n")
sys.stdout.flush()
from transformers import pipeline
# Load the model
sys.stdout.write("Loading the model ...\n")
sys.stdout.flush()
asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large", device="cuda")
#transcribe
sys.stdout.write("Transcribing ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", chunk_length_s=28, return_timestamps=True, generate_kwargs={'num_beams': 5, 'task': 'transcribe', 'language': 'no'})
with open('output_transcribe.txt', 'w+') as fh:
    fh.write(text['text'])
with open('output_transcribe_timestamps.txt', 'w', encoding='utf-8') as fh:
    for chunk in text['chunks']:
        start_time, end_time = chunk['timestamp']
        transcribed_text = chunk['text']
        fh.write(f"{start_time}-{end_time}: {transcribed_text}\n")
#translate
sys.stdout.write("Translating ...\n")
sys.stdout.flush()
text = asr("audio_king.mp3", chunk_length_s=28, return_timestamps=True, generate_kwargs={'num_beams': 5, 'task': 'translate', 'language': 'en'})
with open('output_translate.txt', 'w+') as fh:
    fh.write(text['text'])
with open('output_translate_timestamps.txt', 'w', encoding='utf-8') as fh:
    for chunk in text['chunks']:
        start_time, end_time = chunk['timestamp']
        transcribed_text = chunk['text']
        fh.write(f"{start_time}-{end_time}: {transcribed_text}\n")
sys.stdout.write("Done\n")
sys.stdout.flush()
Environment setup pip commands used
python -m pip install --upgrade pip
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts wheel==0.40.0
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts numba==0.57.0
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts tqdm==4.65.0
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts transformers==4.35.2
pip install --no-cache-dir --ignore-installed --force-reinstall --no-warn-conflicts torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
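After installing, a quick sanity check that this torch build can actually see the GPU before running the script:

python -c "import torch; print(torch.__version__, torch.cuda.is_available())"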
On Windows you also need to have ffmpeg.exe in the same folder as the script.
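A quick way to confirm ffmpeg can be found, either on the PATH or next to the script (a small standard-library sketch, my own addition):

import os, shutil

# Look for ffmpeg on the PATH and in the script's own folder
here = os.path.dirname(os.path.abspath(__file__))
found = shutil.which("ffmpeg") or shutil.which("ffmpeg", path=here)
print(found or "ffmpeg not found")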
I'm currently using
from transformers import pipeline
asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large")
result = asr("king.mp3", generate_kwargs={'task': 'transcribe', 'language': 'no'})
print(result)
Is it possible to change the language to "sami" to get the Sámi transcription? And if so, can I use translation to English simultaneously?
@Flameglory, no, unfortunately, while the NB-Whisper models can translate North Sámi to Norwegian, they cannot transcribe to Sámi, as Sámi is not supported by Whisper. For Sámi transcription a different model is needed. We have a working prototype for it: https://huggingface.co/NbAiLab/whisper-large-sme, but it does not support timestamps at the moment. There's a serverless demo that can be accessed here: https://huggingface.co/spaces/versae/whisper-sami-demo
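For anyone who wants to try the Sámi prototype locally, a minimal sketch assuming it loads through the same ASR pipeline as the other NB-Whisper checkpoints (the file name is just a placeholder, and remember it does not support timestamps yet):

from transformers import pipeline

# NbAiLab/whisper-large-sme is a prototype; this assumes it works with the standard ASR pipeline
asr_sme = pipeline("automatic-speech-recognition", model="NbAiLab/whisper-large-sme", device="cuda")
result = asr_sme("audio_sami.mp3", chunk_length_s=28)
print(result['text'])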