Varun Aditya Balaji committed on
Commit
836ffb0
•
1 Parent(s): 1f91072

basic pipeline done: VoxLingua107 language ID routes audio to the Chinese XLSR-Wav2Vec2 model or the English fine-tuned HuBERT model

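In outline, the pipeline added in Pipeline.ipynb below detects the spoken language first and then hands the audio to the matching ASR model. A minimal standalone sketch of that flow (assuming the three model directories from this commit are checked out locally, and that VoxLingua107 reports Mandarin as 'zh: Chinese', as recorded in the notebook):

import librosa
import torch
from transformers import Wav2Vec2Processor, HubertForCTC
from huggingsound import SpeechRecognitionModel
from speechbrain.pretrained import EncoderClassifier

# Load the models added in this commit as submodules
model_chinese = SpeechRecognitionModel("./wav2vec2-large-xlsr-chinese")
processor = Wav2Vec2Processor.from_pretrained("./english_fine_tune")
model = HubertForCTC.from_pretrained("./english_fine_tune")
language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp")

def pipeline(path_to_audio):
    # Classify the language, then route to the matching ASR model
    signal = language_id.load_audio(path_to_audio)
    prediction = language_id.classify_batch(signal)
    if prediction[3][0] == 'zh: Chinese':
        return model_chinese.transcribe([path_to_audio])[0]['transcription']
    input_audio, sr = librosa.load(path_to_audio, sr=16000)
    # passing sampling_rate silences the warning seen in the notebook outputs
    input_values = processor(input_audio, sampling_rate=16000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])

pipeline('english.wav')  # -> 'WITHOUT THE DATA SET THE ARTICLE IS USELESS' in ~0.54 s per the notebook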
.ipynb_checkpoints/Fine_Tune_XLSR_Wav2Vec2-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/Pipeline-checkpoint.ipynb ADDED
@@ -0,0 +1,85 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6afcd792",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import librosa\n",
+ "import torch\n",
+ "from transformers import Wav2Vec2Processor, HubertForCTC\n",
+ "from huggingsound import SpeechRecognitionModel\n",
+ "from speechbrain.pretrained import EncoderClassifier\n",
+ "\n",
+ "# Chinese transcription via huggingsound\n",
+ "model_chinese = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+ "audio_paths = [\"1.wav\"]\n",
+ "transcriptions = model_chinese.transcribe(audio_paths)\n",
+ "\n",
+ "# English transcription via the fine-tuned HuBERT checkpoint\n",
+ "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n",
+ "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")\n",
+ "input_audio, sr = librosa.load('english.wav', sr=16000)\n",
+ "input_values = processor(input_audio, return_tensors=\"pt\").input_values  # Batch size 1\n",
+ "logits = model(input_values).logits\n",
+ "predicted_ids = torch.argmax(logits, dim=-1)\n",
+ "transcription = processor.decode(predicted_ids[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e1831eab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_chinese = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+ "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n",
+ "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")\n",
+ "language_id = EncoderClassifier.from_hparams(source=\"speechbrain/lang-id-voxlingua107-ecapa\", savedir=\"tmp\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8b7aee3c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def pipeline(path_to_audio):\n",
+ "    # Detect the spoken language, then route to the matching ASR model\n",
+ "    signal = language_id.load_audio(path_to_audio)\n",
+ "    prediction = language_id.classify_batch(signal)\n",
+ "\n",
+ "    if prediction[3][0] != 'zh: Chinese':\n",
+ "        input_audio, sr = librosa.load(path_to_audio, sr=16000)\n",
+ "        input_values = processor(input_audio, return_tensors=\"pt\").input_values  # Batch size 1\n",
+ "        logits = model(input_values).logits\n",
+ "        predicted_ids = torch.argmax(logits, dim=-1)\n",
+ "        return processor.decode(predicted_ids[0])\n",
+ "    else:\n",
+ "        transcriptions = model_chinese.transcribe([path_to_audio])\n",
+ "        return transcriptions[0]['transcription']"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
.ipynb_checkpoints/Speech_to_text-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
+ {
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
.ipynb_checkpoints/Testing_Models-checkpoint.ipynb ADDED
@@ -0,0 +1,235 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ac7631cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import re\n",
+ "import librosa\n",
+ "from datasets import load_dataset, load_metric\n",
+ "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n",
+ "import warnings\n",
+ "import os\n",
+ "\n",
+ "\n",
+ "LANG_ID = \"zh-CN\"\n",
+ "MODEL_ID = \"zh-CN-output-aishell\"\n",
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"  # used by model.to() and evaluate() below\n",
+ "\n",
+ "test_dataset = load_dataset(\"common_voice\", LANG_ID, split=\"test\")\n",
+ "\n",
+ "wer = load_metric(\"wer\")\n",
+ "cer = load_metric(\"cer\")\n",
+ "\n",
+ "\n",
+ "\n",
+ "processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)\n",
+ "model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)\n",
+ "model.to(DEVICE)\n",
+ "\n",
+ "# Preprocessing the datasets.\n",
+ "# We need to read the audio files as arrays\n",
+ "def speech_file_to_array_fn(batch):\n",
+ "    with warnings.catch_warnings():\n",
+ "        warnings.simplefilter(\"ignore\")\n",
+ "        speech_array, sampling_rate = librosa.load(batch[\"path\"], sr=16_000)\n",
+ "    batch[\"speech\"] = speech_array\n",
+ "    batch[\"sentence\"] = (\n",
+ "        re.sub(\"([^\\u4e00-\\u9fa5\\u0030-\\u0039])\", \"\", batch[\"sentence\"]).lower() + \" \"\n",
+ "    )\n",
+ "    return batch\n",
+ "\n",
+ "\n",
+ "test_dataset = test_dataset.map(\n",
+ "    speech_file_to_array_fn,\n",
+ "    num_proc=15,\n",
+ "    remove_columns=['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
+ ")\n",
+ "\n",
+ "# Preprocessing the datasets.\n",
+ "# We need to read the audio files as arrays\n",
+ "def evaluate(batch):\n",
+ "    inputs = processor(\n",
+ "        batch[\"speech\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True\n",
+ "    )\n",
+ "\n",
+ "    with torch.no_grad():\n",
+ "        logits = model(\n",
+ "            inputs.input_values.to(DEVICE),\n",
+ "            attention_mask=inputs.attention_mask.to(DEVICE),\n",
+ "        ).logits\n",
+ "\n",
+ "    pred_ids = torch.argmax(logits, dim=-1)\n",
+ "    batch[\"pred_strings\"] = processor.batch_decode(pred_ids)\n",
+ "    return batch\n",
+ "\n",
+ "\n",
+ "result = test_dataset.map(evaluate, batched=True, batch_size=8)\n",
+ "\n",
+ "predictions = [x.lower() for x in result[\"pred_strings\"]]\n",
+ "references = [x.lower() for x in result[\"sentence\"]]\n",
+ "\n",
+ "print(\n",
+ "    f\"WER: {wer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}\"\n",
+ ")\n",
+ "print(f\"CER: {cer.compute(predictions=predictions, references=references) * 100}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "7db04701",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11/08/2022 09:41:20 - INFO - huggingsound.speech_recognition.model - Loading model...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11/08/2022 09:41:23 - WARNING - root - bos_token <s> not in provided tokens. It will be added to the list of tokens\n",
+ "11/08/2022 09:41:23 - WARNING - root - eos_token </s> not in provided tokens. It will be added to the list of tokens\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|███████████████████████████████████████████████| 1/1 [00:00<00:00, 2.11it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from huggingsound import SpeechRecognitionModel\n",
+ "model = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+ "audio_paths = [\"1.wav\"]\n",
+ "transcriptions = model.transcribe(audio_paths)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "23316152",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'你喜欢饭吗'"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# transcriptions[0]['transcription'].replace('[PAD]','')\n",
+ "transcriptions[0]['transcription']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "730d4afa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from transformers import Wav2Vec2Processor, HubertForCTC\n",
+ "from datasets import load_dataset\n",
+ "\n",
+ "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n",
+ "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "f45768e8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import librosa\n",
+ "input_audio, sr = librosa.load('english.wav', sr=16000)\n",
+ "input_values = processor(input_audio, return_tensors=\"pt\").input_values  # Batch size 1\n",
+ "logits = model(input_values).logits\n",
+ "predicted_ids = torch.argmax(logits, dim=-1)\n",
+ "transcription = processor.decode(predicted_ids[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "8bd98a38",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'WITHOUT THE DATA SET THE ARTICLE IS USELESS'"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "transcription"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "db6a5667",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
1.wav ADDED
Binary file (203 kB).
 
Fine_Tune_XLSR_Wav2Vec2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Pipeline.ipynb ADDED
@@ -0,0 +1,191 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "edc2e2ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import librosa\n",
+ "import torch\n",
+ "from transformers import Wav2Vec2Processor, HubertForCTC\n",
+ "from huggingsound import SpeechRecognitionModel\n",
+ "import torchaudio\n",
+ "from speechbrain.pretrained import EncoderClassifier\n",
+ "import time\n",
+ "from transformers import Pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "76f25cc3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11/08/2022 14:17:47 - INFO - huggingsound.speech_recognition.model - Loading model...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11/08/2022 14:17:49 - WARNING - root - bos_token <s> not in provided tokens. It will be added to the list of tokens\n",
+ "11/08/2022 14:17:49 - WARNING - root - eos_token </s> not in provided tokens. It will be added to the list of tokens\n"
+ ]
+ }
+ ],
+ "source": [
+ "model_chinese = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+ "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n",
+ "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")\n",
+ "language_id = EncoderClassifier.from_hparams(source=\"speechbrain/lang-id-voxlingua107-ecapa\", savedir=\"tmp\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "3b142546",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def pipeline(path_to_audio):\n",
+ "    # Detect the spoken language, then route to the matching ASR model\n",
+ "    signal = language_id.load_audio(path_to_audio)\n",
+ "    prediction = language_id.classify_batch(signal)\n",
+ "\n",
+ "    if prediction[3][0] == 'zh: Chinese':\n",
+ "        print('Detected language is Chinese')\n",
+ "        transcriptions = model_chinese.transcribe([path_to_audio])\n",
+ "        print(transcriptions[0]['transcription'])\n",
+ "    else:\n",
+ "        print('Detected language is English')\n",
+ "        input_audio, sr = librosa.load(path_to_audio, sr=16000)\n",
+ "        input_values = processor(input_audio, return_tensors=\"pt\").input_values\n",
+ "        logits = model(input_values).logits\n",
+ "        predicted_ids = torch.argmax(logits, dim=-1)\n",
+ "        transcription = processor.decode(predicted_ids[0])\n",
+ "        print(transcription)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "48bed0f8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Detected language is English\n",
+ "WITHOUT THE DATA SET THE ARTICLE IS USELESS\n"
+ ]
+ }
+ ],
+ "source": [
+ "class Speech_to_Text(Pipeline):\n",
+ "    # Unfinished sketch: wrap the routing logic above in a transformers Pipeline\n",
+ "    def postprocess(self, model_outputs):\n",
+ "        if prediction[3][0] == 'zh: Chinese':\n",
+ "            pass"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "b0fae1dd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Detected language is English\n",
+ "WITHOUT THE DATA SET THE ARTICLE IS USELESS\n"
+ ]
+ }
+ ],
+ "source": [
+ "start = time.time()\n",
+ "pipeline('english.wav')\n",
+ "end = time.time()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "1e0321b5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.5424931049346924"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "end - start"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a069a0fd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
Testing_Models.ipynb ADDED
@@ -0,0 +1,235 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ac7631cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import re\n",
+ "import librosa\n",
+ "from datasets import load_dataset, load_metric\n",
+ "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n",
+ "import warnings\n",
+ "import os\n",
+ "\n",
+ "\n",
+ "LANG_ID = \"zh-CN\"\n",
+ "MODEL_ID = \"zh-CN-output-aishell\"\n",
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"  # used by model.to() and evaluate() below\n",
+ "\n",
+ "test_dataset = load_dataset(\"common_voice\", LANG_ID, split=\"test\")\n",
+ "\n",
+ "wer = load_metric(\"wer\")\n",
+ "cer = load_metric(\"cer\")\n",
+ "\n",
+ "\n",
+ "\n",
+ "processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)\n",
+ "model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)\n",
+ "model.to(DEVICE)\n",
+ "\n",
+ "# Preprocessing the datasets.\n",
+ "# We need to read the audio files as arrays\n",
+ "def speech_file_to_array_fn(batch):\n",
+ "    with warnings.catch_warnings():\n",
+ "        warnings.simplefilter(\"ignore\")\n",
+ "        speech_array, sampling_rate = librosa.load(batch[\"path\"], sr=16_000)\n",
+ "    batch[\"speech\"] = speech_array\n",
+ "    batch[\"sentence\"] = (\n",
+ "        re.sub(\"([^\\u4e00-\\u9fa5\\u0030-\\u0039])\", \"\", batch[\"sentence\"]).lower() + \" \"\n",
+ "    )\n",
+ "    return batch\n",
+ "\n",
+ "\n",
+ "test_dataset = test_dataset.map(\n",
+ "    speech_file_to_array_fn,\n",
+ "    num_proc=15,\n",
+ "    remove_columns=['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
+ ")\n",
+ "\n",
+ "# Preprocessing the datasets.\n",
+ "# We need to read the audio files as arrays\n",
+ "def evaluate(batch):\n",
+ "    inputs = processor(\n",
+ "        batch[\"speech\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True\n",
+ "    )\n",
+ "\n",
+ "    with torch.no_grad():\n",
+ "        logits = model(\n",
+ "            inputs.input_values.to(DEVICE),\n",
+ "            attention_mask=inputs.attention_mask.to(DEVICE),\n",
+ "        ).logits\n",
+ "\n",
+ "    pred_ids = torch.argmax(logits, dim=-1)\n",
+ "    batch[\"pred_strings\"] = processor.batch_decode(pred_ids)\n",
+ "    return batch\n",
+ "\n",
+ "\n",
+ "result = test_dataset.map(evaluate, batched=True, batch_size=8)\n",
+ "\n",
+ "predictions = [x.lower() for x in result[\"pred_strings\"]]\n",
+ "references = [x.lower() for x in result[\"sentence\"]]\n",
+ "\n",
+ "print(\n",
+ "    f\"WER: {wer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}\"\n",
+ ")\n",
+ "print(f\"CER: {cer.compute(predictions=predictions, references=references) * 100}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "7db04701",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11/08/2022 09:41:20 - INFO - huggingsound.speech_recognition.model - Loading model...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11/08/2022 09:41:23 - WARNING - root - bos_token <s> not in provided tokens. It will be added to the list of tokens\n",
+ "11/08/2022 09:41:23 - WARNING - root - eos_token </s> not in provided tokens. It will be added to the list of tokens\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|███████████████████████████████████████████████| 1/1 [00:00<00:00, 2.11it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from huggingsound import SpeechRecognitionModel\n",
+ "model = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n",
+ "audio_paths = [\"1.wav\"]\n",
+ "transcriptions = model.transcribe(audio_paths)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "23316152",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'你喜欢饭吗'"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# transcriptions[0]['transcription'].replace('[PAD]','')\n",
+ "transcriptions[0]['transcription']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "730d4afa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from transformers import Wav2Vec2Processor, HubertForCTC\n",
+ "from datasets import load_dataset\n",
+ "\n",
+ "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n",
+ "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "f45768e8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import librosa\n",
+ "input_audio, sr = librosa.load('english.wav', sr=16000)\n",
+ "input_values = processor(input_audio, return_tensors=\"pt\").input_values  # Batch size 1\n",
+ "logits = model(input_values).logits\n",
+ "predicted_ids = torch.argmax(logits, dim=-1)\n",
+ "transcription = processor.decode(predicted_ids[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "8bd98a38",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'WITHOUT THE DATA SET THE ARTICLE IS USELESS'"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "transcription"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "db6a5667",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
english.wav ADDED
Binary file (309 kB).
 
english_fine_tune ADDED
@@ -0,0 +1 @@
+ Subproject commit ece5fabbf034c1073acae96d5401b25be96709d8
lang-id-voxlingua107-ecapa ADDED
@@ -0,0 +1 @@
+ Subproject commit d771b530cec097adc0088b4dbd173e242f895464
wav2vec2-large-xlsr-chinese ADDED
@@ -0,0 +1 @@
+ Subproject commit 369f73139f85a98570ff74e641dc93d421a3860e