Varun Aditya Balaji
committed
Commit • 836ffb0
Parent(s): 1f91072
basic pipeline done
Browse files
- .ipynb_checkpoints/Fine_Tune_XLSR_Wav2Vec2-checkpoint.ipynb +0 -0
- .ipynb_checkpoints/Pipeline-checkpoint.ipynb +85 -0
- .ipynb_checkpoints/Speech_to_text-checkpoint.ipynb +6 -0
- .ipynb_checkpoints/Testing_Models-checkpoint.ipynb +235 -0
- 1.wav +0 -0
- Fine_Tune_XLSR_Wav2Vec2.ipynb +0 -0
- Pipeline.ipynb +191 -0
- Testing_Models.ipynb +235 -0
- english.wav +0 -0
- english_fine_tune +1 -0
- lang-id-voxlingua107-ecapa +1 -0
- wav2vec2-large-xlsr-chinese +1 -0
.ipynb_checkpoints/Fine_Tune_XLSR_Wav2Vec2-checkpoint.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
.ipynb_checkpoints/Pipeline-checkpoint.ipynb
ADDED
@@ -0,0 +1,85 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6afcd792",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import librosa\n",
+    "import torch\n",
+    "from transformers import Wav2Vec2Processor, HubertForCTC\n",
+    "from huggingsound import SpeechRecognitionModel\n",
+    "model = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+    "audio_paths = [\"1.wav\"]\n",
+    "transcriptions = model.transcribe(audio_paths)\n",
+    "\n",
+    "\n",
+    "input_audio, sr = librosa.load('english.wav', sr = 16000)\n",
+    "input_values = processor(input_audio, return_tensors=\"pt\").input_values  # Batch size 1\n",
+    "logits = model(input_values).logits\n",
+    "predicted_ids = torch.argmax(logits, dim=-1)\n",
+    "transcription = processor.decode(predicted_ids[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1831eab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+    "\n",
+    "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n",
+    "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b7aee3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pipeline(path_to_audio):\n",
+    "    # prediction comes from the speechbrain language-ID classifier (see Pipeline.ipynb)\n",
+    "    if prediction[3][0] != 'zh: Chinese':\n",
+    "        input_audio, sr = librosa.load(path_to_audio, sr = 16000)\n",
+    "        input_values = processor(input_audio, return_tensors=\"pt\").input_values  # Batch size 1\n",
+    "        logits = model(input_values).logits\n",
+    "        predicted_ids = torch.argmax(logits, dim=-1)\n",
+    "        transcription = processor.decode(predicted_ids[0])\n",
+    "    else:\n",
+    "        transcriptions = model.transcribe([path_to_audio])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
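The language check left empty in this checkpoint is implemented in Pipeline.ipynb below using a speechbrain VoxLingua107 classifier. As a standalone illustration, here is a minimal sketch of that routing step; the model source and the 'zh: Chinese' label are taken from the notebook, while the detect_language helper name is an assumption of this sketch:

# Minimal sketch of the language-ID step the finished pipeline below relies on.
from speechbrain.pretrained import EncoderClassifier

language_id = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp"
)

def detect_language(path_to_audio):
    signal = language_id.load_audio(path_to_audio)
    prediction = language_id.classify_batch(signal)
    # classify_batch returns (log-probs, best score, best index, text labels);
    # prediction[3][0] is a label string such as 'zh: Chinese'
    return prediction[3][0]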
.ipynb_checkpoints/Speech_to_text-checkpoint.ipynb
ADDED
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
.ipynb_checkpoints/Testing_Models-checkpoint.ipynb
ADDED
@@ -0,0 +1,235 @@
(Diff omitted here: this Jupyter autosave checkpoint is identical to Testing_Models.ipynb below.)
1.wav
ADDED
Binary file (203 kB)
Fine_Tune_XLSR_Wav2Vec2.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
Pipeline.ipynb
ADDED
@@ -0,0 +1,191 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "edc2e2ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import librosa\n",
+    "import torch\n",
+    "from transformers import Wav2Vec2Processor, HubertForCTC\n",
+    "from huggingsound import SpeechRecognitionModel\n",
+    "import torchaudio\n",
+    "from speechbrain.pretrained import EncoderClassifier\n",
+    "import time\n",
+    "from transformers import Pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "76f25cc3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "11/08/2022 14:17:47 - INFO - huggingsound.speech_recognition.model - Loading model...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "11/08/2022 14:17:49 - WARNING - root - bos_token <s> not in provided tokens. It will be added to the list of tokens\n",
+      "11/08/2022 14:17:49 - WARNING - root - eos_token </s> not in provided tokens. It will be added to the list of tokens\n"
+     ]
+    }
+   ],
+   "source": [
+    "model_chinese = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+    "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n",
+    "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")\n",
+    "language_id = EncoderClassifier.from_hparams(source=\"speechbrain/lang-id-voxlingua107-ecapa\", savedir=\"tmp\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "3b142546",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pipeline(path_to_audio):\n",
+    "    signal = language_id.load_audio(path_to_audio)\n",
+    "    prediction = language_id.classify_batch(signal)\n",
+    "\n",
+    "    if prediction[3][0] == 'zh: Chinese':\n",
+    "        print('Detected language is Chinese')\n",
+    "        transcriptions = model_chinese.transcribe([path_to_audio])\n",
+    "        print(transcriptions[0]['transcription'])\n",
+    "    else:\n",
+    "        print('Detected language is English')\n",
+    "        input_audio, sr = librosa.load(path_to_audio, sr = 16000)\n",
+    "        input_values = processor(input_audio, return_tensors=\"pt\").input_values\n",
+    "        logits = model(input_values).logits\n",
+    "        predicted_ids = torch.argmax(logits, dim=-1)\n",
+    "        transcription = processor.decode(predicted_ids[0])\n",
+    "        print(transcription)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "48bed0f8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Detected language is English\n",
+      "WITHOUT THE DATA SET THE ARTICLE IS USELESS\n"
+     ]
+    }
+   ],
+   "source": [
+    "class Speech_to_Text(Pipeline):\n",
+    "    def postprocess(self, model_outputs):\n",
+    "        # work in progress: route to the Chinese or English decoder as in pipeline() above\n",
+    "        if prediction[3][0] == 'zh: Chinese':\n",
+    "            ..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "b0fae1dd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Detected language is English\n",
+      "WITHOUT THE DATA SET THE ARTICLE IS USELESS\n"
+     ]
+    }
+   ],
+   "source": [
+    "start = time.time()\n",
+    "pipeline('english.wav')\n",
+    "end = time.time()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "1e0321b5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.5424931049346924"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "end - start"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a069a0fd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
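The Speech_to_Text class in the diff above is left as a stub. For reference, a custom transformers Pipeline subclass has to implement _sanitize_parameters, preprocess, _forward and postprocess. The sketch below shows one hypothetical way to wrap the English HuBERT decoder from this notebook in that interface; it is a sketch under those assumptions, not the author's committed implementation, and the SpeechToTextPipeline name is invented here:

import torch
from transformers import Pipeline

class SpeechToTextPipeline(Pipeline):
    # Hypothetical completion of the Speech_to_Text stub above; these four
    # methods are the ones the transformers Pipeline API requires.
    def _sanitize_parameters(self, **kwargs):
        # no extra preprocess/forward/postprocess arguments in this sketch
        return {}, {}, {}

    def preprocess(self, audio_array):
        # audio_array is assumed to be a 1-D waveform already resampled to 16 kHz
        return self.feature_extractor(
            audio_array, sampling_rate=16000, return_tensors="pt"
        )

    def _forward(self, model_inputs):
        with torch.no_grad():
            return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        # greedy CTC decoding, as in the pipeline() function above
        pred_ids = torch.argmax(model_outputs.logits, dim=-1)
        return self.tokenizer.decode(pred_ids[0])

It could then be instantiated from the objects the notebook already loads, e.g. SpeechToTextPipeline(model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor); folding the speechbrain language-ID routing into preprocess would be the remaining step.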
Testing_Models.ipynb
ADDED
@@ -0,0 +1,235 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac7631cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import re\n",
+    "import librosa\n",
+    "from datasets import load_dataset, load_metric\n",
+    "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n",
+    "import warnings\n",
+    "import os\n",
+    "\n",
+    "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"  # device used for inference below\n",
+    "\n",
+    "LANG_ID = \"zh-CN\"\n",
+    "MODEL_ID = \"zh-CN-output-aishell\"\n",
+    "\n",
+    "test_dataset = load_dataset(\"common_voice\", LANG_ID, split=\"test\")\n",
+    "\n",
+    "wer = load_metric(\"wer\")\n",
+    "cer = load_metric(\"cer\")\n",
+    "\n",
+    "processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)\n",
+    "model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)\n",
+    "model.to(DEVICE)\n",
+    "\n",
+    "# Preprocessing the datasets.\n",
+    "# We need to read the audio files as arrays\n",
+    "def speech_file_to_array_fn(batch):\n",
+    "    with warnings.catch_warnings():\n",
+    "        warnings.simplefilter(\"ignore\")\n",
+    "        speech_array, sampling_rate = librosa.load(batch[\"path\"], sr=16_000)\n",
+    "    batch[\"speech\"] = speech_array\n",
+    "    batch[\"sentence\"] = (\n",
+    "        re.sub(\"([^\\u4e00-\\u9fa5\\u0030-\\u0039])\", \"\", batch[\"sentence\"]).lower() + \" \"\n",
+    "    )\n",
+    "    return batch\n",
+    "\n",
+    "\n",
+    "test_dataset = test_dataset.map(\n",
+    "    speech_file_to_array_fn,\n",
+    "    num_proc=15,\n",
+    "    remove_columns=['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
+    ")\n",
+    "\n",
+    "# Run batched inference and decode the predicted token ids\n",
+    "def evaluate(batch):\n",
+    "    inputs = processor(\n",
+    "        batch[\"speech\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True\n",
+    "    )\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        logits = model(\n",
+    "            inputs.input_values.to(DEVICE),\n",
+    "            attention_mask=inputs.attention_mask.to(DEVICE),\n",
+    "        ).logits\n",
+    "\n",
+    "    pred_ids = torch.argmax(logits, dim=-1)\n",
+    "    batch[\"pred_strings\"] = processor.batch_decode(pred_ids)\n",
+    "    return batch\n",
+    "\n",
+    "\n",
+    "result = test_dataset.map(evaluate, batched=True, batch_size=8)\n",
+    "\n",
+    "predictions = [x.lower() for x in result[\"pred_strings\"]]\n",
+    "references = [x.lower() for x in result[\"sentence\"]]\n",
+    "\n",
+    "print(\n",
+    "    f\"WER: {wer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}\"\n",
+    ")\n",
+    "print(f\"CER: {cer.compute(predictions=predictions, references=references) * 100}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "7db04701",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "11/08/2022 09:41:20 - INFO - huggingsound.speech_recognition.model - Loading model...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "11/08/2022 09:41:23 - WARNING - root - bos_token <s> not in provided tokens. It will be added to the list of tokens\n",
+      "11/08/2022 09:41:23 - WARNING - root - eos_token </s> not in provided tokens. It will be added to the list of tokens\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|███████████████████████████████████████████████| 1/1 [00:00<00:00,  2.11it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from huggingsound import SpeechRecognitionModel\n",
+    "model = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+    "audio_paths = [\"1.wav\"]\n",
+    "transcriptions = model.transcribe(audio_paths)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "23316152",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'你喜欢饭吗'"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# transcriptions[0]['transcription'].replace('[PAD]','')\n",
+    "transcriptions[0]['transcription']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "730d4afa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import Wav2Vec2Processor, HubertForCTC\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n",
+    "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "f45768e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import librosa\n",
+    "input_audio, sr = librosa.load('english.wav', sr = 16000)\n",
+    "input_values = processor(input_audio, return_tensors=\"pt\").input_values  # Batch size 1\n",
+    "logits = model(input_values).logits\n",
+    "predicted_ids = torch.argmax(logits, dim=-1)\n",
+    "transcription = processor.decode(predicted_ids[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "8bd98a38",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'WITHOUT THE DATA SET THE ARTICLE IS USELESS'"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "transcription"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db6a5667",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
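A note on the evaluation cell above: WER counts word-level edit operations while CER counts character-level ones, which is why both are reported for Chinese, where segmentation into words is ambiguous. A toy sketch of the same load_metric calls on made-up strings (the example sentences are invented for illustration):

# Toy illustration of the WER/CER computation performed by the evaluation cell.
from datasets import load_metric

wer = load_metric("wer")
cer = load_metric("cer")

predictions = ["without the data set the article is useless"]
references = ["without the dataset the article is useless"]

# "data set" vs "dataset" is a substitution plus an insertion at the word
# level, but only a single extra space at the character level, so WER comes
# out much higher than CER here.
print("WER:", wer.compute(predictions=predictions, references=references))
print("CER:", cer.compute(predictions=predictions, references=references))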
english.wav
ADDED
Binary file (309 kB)
english_fine_tune
ADDED
@@ -0,0 +1 @@
+Subproject commit ece5fabbf034c1073acae96d5401b25be96709d8
lang-id-voxlingua107-ecapa
ADDED
@@ -0,0 +1 @@
+Subproject commit d771b530cec097adc0088b4dbd173e242f895464
wav2vec2-large-xlsr-chinese
ADDED
@@ -0,0 +1 @@
+Subproject commit 369f73139f85a98570ff74e641dc93d421a3860e