Francis0917 committed on

Commit 2045faa
1 Parent(s): 55d46a2

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.

Files changed (50)
  1. .gitattributes +3 -0
  2. README.md +111 -8
  3. checkpoint_results/checkpoint_gctc_clap/20240725-154258/checkpoint +2 -0
  4. checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.data-00000-of-00001 +3 -0
  5. checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.index +0 -0
  6. checkpoint_results/checkpoint_guided_ctc/20240725-011006/checkpoint +2 -0
  7. checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.data-00000-of-00001 +3 -0
  8. checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.index +0 -0
  9. criterion/__pycache__/total.cpython-37.pyc +0 -0
  10. criterion/__pycache__/total_ctc1_clap.cpython-37.pyc +0 -0
  11. criterion/__pycache__/utils.cpython-37.pyc +0 -0
  12. criterion/total.py +69 -0
  13. criterion/total_CLKWS.py +100 -0
  14. criterion/total_ctc1.py +97 -0
  15. criterion/total_ctc1_clap.py +125 -0
  16. criterion/utils.py +32 -0
  17. dataset/__pycache__/dataloader_demo.cpython-37.pyc +0 -0
  18. dataset/__pycache__/dataloader_infe.cpython-37.pyc +0 -0
  19. dataset/__pycache__/google.cpython-37.pyc +0 -0
  20. dataset/__pycache__/google_infe202405.cpython-37.pyc +0 -0
  21. dataset/__pycache__/libriphrase.cpython-37.pyc +0 -0
  22. dataset/__pycache__/libriphrase_ctc1.cpython-37.pyc +0 -0
  23. dataset/__pycache__/qualcomm.cpython-37.pyc +0 -0
  24. dataset/dataloader_demo.py +182 -0
  25. dataset/dataloader_infe.py +164 -0
  26. dataset/g2p/LICENSE.txt +201 -0
  27. dataset/g2p/g2p_en/__init__.py +1 -0
  28. dataset/g2p/g2p_en/__pycache__/__init__.cpython-37.pyc +0 -0
  29. dataset/g2p/g2p_en/__pycache__/expand.cpython-37.pyc +0 -0
  30. dataset/g2p/g2p_en/__pycache__/g2p.cpython-37.pyc +0 -0
  31. dataset/g2p/g2p_en/checkpoint20.npz +3 -0
  32. dataset/g2p/g2p_en/expand.py +79 -0
  33. dataset/g2p/g2p_en/g2p.py +249 -0
  34. dataset/g2p/g2p_en/homographs.en +379 -0
  35. dataset/google.py +188 -0
  36. dataset/google_infe202405.py +192 -0
  37. dataset/libriphrase.py +331 -0
  38. dataset/libriphrase_ctc1.py +346 -0
  39. dataset/qualcomm.py +180 -0
  40. demo.py +168 -0
  41. docker/Dockerfile +25 -0
  42. flagged/Sound/c129aef35ba4cb66620f813cd7268c4be510a66d/ok_google-183000.wav +0 -0
  43. flagged/Sound/d35a5cf80a9403828bc601a0a761a5f88da06f00/realtek_go-183033.wav +0 -0
  44. flagged/log.csv +8 -0
  45. inference.py +141 -0
  46. model/__pycache__/discriminator.cpython-37.pyc +0 -0
  47. model/__pycache__/encoder.cpython-37.pyc +0 -0
  48. model/__pycache__/extractor.cpython-37.pyc +0 -0
  49. model/__pycache__/log_melspectrogram.cpython-37.pyc +0 -0
  50. model/__pycache__/speech_embedding.cpython-37.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
+model/google_speech_embedding/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,115 @@
 ---
-title: CL-KWS 202408 V1
-emoji: 📈
-colorFrom: blue
-colorTo: green
+title: CL-KWS_202408_v1
+app_file: demo.py
 sdk: gradio
-sdk_version: 4.44.0
-app_file: app.py
-pinned: false
+sdk_version: 3.34.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+### Datasets
+
+* [LibriPhrase]
+  LibriSpeech corpus: https://www.openslr.org/12
+  Recipe for LibriPhrase: https://github.com/gusrud1103/LibriPhrase
+
+* [Google Speech Commands]
+  http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz
+  http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz
+  https://www.tensorflow.org/datasets/catalog/speech_commands
+
+* [Qualcomm Keyword Speech]
+  https://www.qualcomm.com/developer/software/keyword-speech-dataset
+
+* [MUSAN (noise)]
+  https://www.openslr.org/17/
+
+## Getting started
+
+### Environment
+
+```bash
+# python=3.7
+conda create --name [name] python=3.7
+conda install -c "nvidia/label/cuda-11.6.0" cuda-nvcc
+conda install -c conda-forge cudnn=8.2.1.32
+pip install -r requirements.txt
+pip install numpy==1.18.5
+pip install tensorflow-model-optimization==0.6.0
+cd /miniconda3/envs/[name]/lib
+ln -s libcusolver.so.11 libcusolver.so.10
+# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/share/homes/yiting/miniconda3/envs/pho/lib
+```
+
+### Training
+
+```bash
+python train_guided_CTC.py \
+    --epoch 23 \
+    --lr 1e-3 \
+    --loss_weight 1.0 1.0 0.2 \
+    --audio_input both \
+    --text_input phoneme \
+    --comment 'user comments for each experiment'
+```
+
+```bash
+python train.py \
+    --epoch 18 \
+    --lr 1e-3 \
+    --loss_weight 1.0 1.0 \
+    --audio_input both \
+    --text_input phoneme \
+    --comment 'user comments for each experiment'
+```
+
+### Fine-tuning
+
+Checkpoint: ./checkpoint_results/checkpoint_guided_ctc/20240725-011006
+
+```bash
+python train_guided_ctc_clap.py \
+    --epoch 5 \
+    --lr 1e-3 \
+    --loss_weight 1.0 1.0 0.01 0.01 \
+    --audio_input both \
+    --text_input phoneme \
+    --load_checkpoint_path '/home/DB/checkpoint_results/checkpoint_guided_ctc/date-time' \
+    --comment 'user comments for each experiment'
+```
+
+```bash
+python train_CLKWS.py \
+    --epoch 4 \
+    --lr 1e-3 \
+    --loss_weight 1.0 1.0 \
+    --audio_input both \
+    --text_input phoneme \
+    --load_checkpoint_path '/home/DB/checkpoint_results/checkpoint/date-time' \
+    --comment 'user comments for each experiment'
+```
+
+### Inference
+
+The keyword list is `target_list` in dataset/google_infe202405.py.
+
+```bash
+python inference.py --audio_input both --text_input phoneme --load_checkpoint_path '/home/DB/checkpoint_results/checkpoint/20240515-111757'
+```
+
+### Demo
+
+Checkpoints: ./checkpoint_results/checkpoint_guided_ctc/20240725-011006 and
+./checkpoint_results/checkpoint_gctc_clap/20240725-154258
+
+```bash
+python demo.py --audio_input both --text_input phoneme --load_checkpoint_path '/home/DB/checkpoint_results/checkpoint_guided_ctc/20240725-011006' --keyword_list_length 8
+```
+
+Demo website: Gradio prints a "Running on public URL" link at startup.
+Upload file format: mono WAV, 256 kbps, 22050 Hz.
+Maximum audio length: `self.maxlen_a = 56000` samples in dataset/dataloader_demo.py (3.5 s at 16 kHz).
+
+### Monitoring
+
+```bash
+tensorboard --logdir ./log/ --bind_all
+```
+
+### Acknowledgements
+
+We acknowledge the following code repository:
+https://github.com/ncsoft/PhonMatchNet
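
The demo expects mono 22050 Hz WAV uploads. As a convenience, here is a minimal sketch of converting an arbitrary recording into that format with pydub (which the demo dataloader already imports); the file names are placeholders, not repository paths:

```python
# Minimal sketch: convert a recording to the mono, 22050 Hz WAV format
# the demo expects. File names are placeholders.
from pydub import AudioSegment

def to_demo_wav(src_path, dst_path):
    audio = AudioSegment.from_file(src_path)
    audio = audio.set_channels(1)        # mono
    audio = audio.set_frame_rate(22050)  # 22050 Hz
    audio.export(dst_path, format="wav")

to_demo_wav("my_recording.m4a", "ok_google.wav")
```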
checkpoint_results/checkpoint_gctc_clap/20240725-154258/checkpoint ADDED
@@ -0,0 +1,2 @@
+model_checkpoint_path: "ckpt-29"
+all_model_checkpoint_paths: "ckpt-29"
checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25da31f91bcff94540bf57296b058d07aaaa804c85ad59d5eaf9bc3f9803c62f
+size 1211835
checkpoint_results/checkpoint_gctc_clap/20240725-154258/ckpt-29.index ADDED
Binary file (2.23 kB)
 
checkpoint_results/checkpoint_guided_ctc/20240725-011006/checkpoint ADDED
@@ -0,0 +1,2 @@
+model_checkpoint_path: "ckpt-23"
+all_model_checkpoint_paths: "ckpt-23"
checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0228d6d9c71e767409ff8d2a300eda7d8d115185d3b793699cae730715424aa
+size 3630878
checkpoint_results/checkpoint_guided_ctc/20240725-011006/ckpt-23.index ADDED
Binary file (6.37 kB)
 
criterion/__pycache__/total.cpython-37.pyc ADDED
Binary file (2.78 kB)
 
criterion/__pycache__/total_ctc1_clap.cpython-37.pyc ADDED
Binary file (4.29 kB)
 
criterion/__pycache__/utils.cpython-37.pyc ADDED
Binary file (1.52 kB)
 
criterion/total.py ADDED
@@ -0,0 +1,69 @@
+import os, sys
+import tensorflow as tf
+import numpy as np
+from tensorflow.keras.losses import Loss, MeanSquaredError
+
+seed = 42
+tf.random.set_seed(seed)
+np.random.seed(seed)
+
+def sequence_cross_entropy(speech_label, text_label, logits, reduction='sum'):
+    """
+    args
+        speech_label : [B, Ls]
+        text_label : [B, Lt]
+        logits : [B, Lt]
+        logits._keras_mask : [B, Lt]
+    """
+    # Data pre-processing
+    if tf.shape(text_label)[1] > tf.shape(speech_label)[1]:
+        speech_label = tf.pad(speech_label, [[0, 0], [0, tf.shape(text_label)[1] - tf.shape(speech_label)[1]]], 'CONSTANT', constant_values=0)
+    elif tf.shape(text_label)[1] < tf.shape(speech_label)[1]:
+        speech_label = speech_label[:, :text_label.shape[1]]
+
+    # Make paired data between text and speech phonemes
+    paired_label = tf.math.equal(text_label, speech_label)
+    paired_label = tf.cast(tf.math.logical_and(tf.cast(paired_label, tf.bool), tf.cast(logits._keras_mask, tf.bool)), tf.float32)
+    paired_label = tf.reshape(tf.ragged.boolean_mask(paired_label, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
+    logits = tf.reshape(tf.ragged.boolean_mask(logits, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
+
+    # Get BinaryCrossEntropy loss
+    BCE = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    loss = BCE(paired_label, logits)
+
+    if reduction == 'sum':
+        loss = tf.math.divide_no_nan(loss, tf.cast(tf.shape(logits)[0], loss.dtype))
+        loss = tf.math.multiply_no_nan(loss, tf.cast(tf.shape(speech_label)[0], loss.dtype))
+
+    return loss
+
+def detection_loss(y_true, y_pred):
+    BFC = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    return BFC(y_true, y_pred)
+
+class TotalLoss(Loss):
+    def __init__(self, weight=1.0):
+        super().__init__()
+        self.weight = weight
+
+    def __call__(self, y_true, y_pred, reduction='sum'):
+        LD = detection_loss(y_true, y_pred)
+        return self.weight * LD, LD
+
+class TotalLoss_SCE(Loss):
+    def __init__(self, weight=[1.0, 1.0]):
+        super().__init__()
+        self.weight = weight
+
+    def __call__(self, y_true, y_pred, speech_label, text_label, logit, reduction='sum'):
+        if self.weight[0] != 0.0:
+            LD = detection_loss(y_true, y_pred)
+        else:
+            LD = 0
+        if self.weight[1] != 0.0:
+            LC = sequence_cross_entropy(speech_label, text_label, logit, reduction=reduction)
+        else:
+            LC = 0
+        return self.weight[0] * LD + self.weight[1] * LC, LD, LC
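
The core of `sequence_cross_entropy` is the pairing step: the speech-side phoneme labels are padded (or truncated) to the text length, and positions where the two sequences agree become positive targets for a masked binary cross-entropy. A toy trace of that step (values invented, mask omitted):

```python
# Toy sketch of the pairing logic inside sequence_cross_entropy.
import tensorflow as tf

text_label   = tf.constant([[3, 5, 0]])   # [B=1, Lt=3]
speech_label = tf.constant([[3, 7]])      # [B=1, Ls=2]

# Pad the speech labels to the text length (Lt > Ls here).
pad = text_label.shape[1] - speech_label.shape[1]
speech_label = tf.pad(speech_label, [[0, 0], [0, pad]])   # [[3, 7, 0]]

# Positions where the two phoneme sequences agree become the BCE targets.
paired = tf.cast(tf.math.equal(text_label, speech_label), tf.float32)
print(paired.numpy())  # [[1. 0. 1.]]
```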
criterion/total_CLKWS.py ADDED
@@ -0,0 +1,100 @@
+import os, sys
+import tensorflow as tf
+import numpy as np
+from tensorflow.keras.losses import Loss, MeanSquaredError
+import math
+
+seed = 42
+tf.random.set_seed(seed)
+np.random.seed(seed)
+
+def sequence_cross_entropy(speech_label, text_label, logits, reduction='sum'):
+    """
+    args
+        speech_label : [B, Ls]
+        text_label : [B, Lt]
+        logits : [B, Lt]
+        logits._keras_mask : [B, Lt]
+    """
+    # Data pre-processing
+    if tf.shape(text_label)[1] > tf.shape(speech_label)[1]:
+        speech_label = tf.pad(speech_label, [[0, 0], [0, tf.shape(text_label)[1] - tf.shape(speech_label)[1]]], 'CONSTANT', constant_values=0)
+    elif tf.shape(text_label)[1] < tf.shape(speech_label)[1]:
+        speech_label = speech_label[:, :text_label.shape[1]]
+
+    # Make paired data between text and speech phonemes
+    paired_label = tf.math.equal(text_label, speech_label)
+    paired_label = tf.cast(tf.math.logical_and(tf.cast(paired_label, tf.bool), tf.cast(logits._keras_mask, tf.bool)), tf.float32)
+    paired_label = tf.reshape(tf.ragged.boolean_mask(paired_label, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
+    logits = tf.reshape(tf.ragged.boolean_mask(logits, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
+
+    # Get BinaryCrossEntropy loss
+    BCE = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    loss = BCE(paired_label, logits)
+
+    if reduction == 'sum':
+        loss = tf.math.divide_no_nan(loss, tf.cast(tf.shape(logits)[0], loss.dtype))
+        loss = tf.math.multiply_no_nan(loss, tf.cast(tf.shape(speech_label)[0], loss.dtype))
+
+    return loss
+
+def detection_loss(y_true, y_pred):
+    BFC = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    return BFC(y_true, y_pred)
+
+def matrix_loss_0(y_true, y_pred):
+    MBC_0 = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    return MBC_0(y_true, y_pred)
+
+def matrix_loss_1(y_true, y_pred):
+    MBC_1 = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    return MBC_1(y_true, y_pred)
+
+class TotalLoss(Loss):
+    def __init__(self, weight=1.0):
+        super().__init__()
+        self.weight = weight
+
+    def __call__(self, y_true, y_pred, reduction='sum'):
+        LD = detection_loss(y_true, y_pred)
+        return self.weight * LD, LD
+
+class TotalLoss_SCE(Loss):
+    def __init__(self, weight=[1.0, 1.0]):
+        super().__init__()
+        self.weight = weight
+
+    def __call__(self, y_true, y_pred, speech_label, text_label, logit, prob, reduction='sum'):
+        if self.weight[0] != 0.0:
+            LD = detection_loss(y_true, y_pred)
+        else:
+            LD = 0
+        if self.weight[1] != 0.0:
+            LC = sequence_cross_entropy(speech_label, text_label, logit, reduction=reduction)
+        else:
+            LC = 0
+
+        # Symmetric cross-entropy over 5x5 similarity blocks:
+        # once along the audio axis, once along the text axis.
+        number_1 = 5
+        number_2 = int(y_pred.shape[0] // number_1)
+        number_3 = int(y_pred.shape[0] // (number_1 * number_1))
+
+        y_pred_1 = tf.reshape(prob, [number_2, number_1])
+        y_true_1 = tf.reshape(y_true, [number_2, number_1])
+        loss_audio = matrix_loss_0(y_true_1, y_pred_1)
+
+        x = tf.reshape(prob, [number_3, number_1, number_1])
+        x_transposed = tf.transpose(x, perm=[0, 2, 1])
+        y_pred_2 = tf.reshape(x_transposed, [number_2, number_1])
+        y = tf.reshape(y_true, [number_3, number_1, number_1])
+        y_transposed = tf.transpose(y, perm=[0, 2, 1])
+        y_true_2 = tf.reshape(y_transposed, [number_2, number_1])
+        loss_text = matrix_loss_1(y_true_2, y_pred_2)
+        loss = 0.5 * loss_audio + 0.5 * loss_text
+
+        return self.weight[0] * LD + self.weight[1] * LC + loss, LD, LC
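
The `number_1 = 5` block reshapes the flat batch of pair probabilities into 5x5 similarity blocks and applies a categorical cross-entropy along the audio axis and, after transposing, along the text axis, then averages the two — a symmetric (CLAP-style) contrastive objective. A self-contained sketch under the assumption of a single 5x5 block (25 audio-text pairs, matches on the diagonal; values illustrative only):

```python
# Minimal sketch, assuming one 5x5 similarity block of 25 pairs.
import tensorflow as tf

prob   = tf.random.normal([25])          # pairwise similarity logits
y_true = tf.reshape(tf.eye(5), [25])     # 1 where audio i matches text i

ce = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.SUM)

sim = tf.reshape(prob, [5, 5])           # rows: audio, cols: text
lbl = tf.reshape(y_true, [5, 5])
loss_audio = ce(lbl, sim)                               # pick text per audio
loss_text  = ce(tf.transpose(lbl), tf.transpose(sim))   # pick audio per text
loss = 0.5 * loss_audio + 0.5 * loss_text
```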
criterion/total_ctc1.py ADDED
@@ -0,0 +1,97 @@
+import os, sys
+import tensorflow as tf
+import numpy as np
+from tensorflow.keras.losses import Loss, MeanSquaredError
+
+seed = 42
+tf.random.set_seed(seed)
+np.random.seed(seed)
+
+def sequence_cross_entropy(speech_label, text_label, logits, reduction='sum'):
+    """
+    args
+        speech_label : [B, Ls]
+        text_label : [B, Lt]
+        logits : [B, Lt]
+        logits._keras_mask : [B, Lt]
+    """
+    # Data pre-processing
+    if tf.shape(text_label)[1] > tf.shape(speech_label)[1]:
+        speech_label = tf.pad(speech_label, [[0, 0], [0, tf.shape(text_label)[1] - tf.shape(speech_label)[1]]], 'CONSTANT', constant_values=0)
+    elif tf.shape(text_label)[1] < tf.shape(speech_label)[1]:
+        speech_label = speech_label[:, :text_label.shape[1]]
+
+    # Make paired data between text and speech phonemes
+    paired_label = tf.math.equal(text_label, speech_label)
+    paired_label = tf.cast(tf.math.logical_and(tf.cast(paired_label, tf.bool), tf.cast(logits._keras_mask, tf.bool)), tf.float32)
+    paired_label = tf.reshape(tf.ragged.boolean_mask(paired_label, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
+    logits = tf.reshape(tf.ragged.boolean_mask(logits, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
+
+    # Get BinaryCrossEntropy loss
+    BCE = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    loss = BCE(paired_label, logits)
+
+    if reduction == 'sum':
+        loss = tf.math.divide_no_nan(loss, tf.cast(tf.shape(logits)[0], loss.dtype))
+        loss = tf.math.multiply_no_nan(loss, tf.cast(tf.shape(speech_label)[0], loss.dtype))
+
+    return loss
+
+def detection_loss(y_true, y_pred):
+    BFC = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    return BFC(y_true, y_pred)
+
+def ctc_loss(affinity_matrix, speech_labels, text_labels, n_speech):
+    # logit_length
+    # n_speech = tf.math.reduce_sum(tf.cast(affinity_matrix._keras_mask, tf.float32), -1)
+
+    # logit
+    transposed_logits = tf.transpose(affinity_matrix, perm=[0, 2, 1])
+    # log_probs = tf.math.log(transposed_logits + 1e-8)
+    # logits_approx = log_probs - tf.reduce_max(log_probs, axis=-1, keepdims=True)
+
+    # label
+    matches = tf.equal(speech_labels, text_labels)
+    indices = tf.range(text_labels.shape[1], dtype=tf.int32)
+    selected_indices = tf.where(matches, indices, tf.fill(tf.shape(text_labels), 0))
+    labels = tf.where(tf.equal(text_labels, 0), text_labels, selected_indices)
+
+    # label_length
+    label_length = tf.math.count_nonzero(labels, axis=1)
+
+    ctc_loss = tf.nn.ctc_loss(labels, transposed_logits, label_length, n_speech,
+                              logits_time_major=False,
+                              unique=None,
+                              blank_index=0,
+                              name=None)
+
+    return ctc_loss
+
+class TotalLoss(Loss):
+    def __init__(self, weight=1.0):
+        super().__init__()
+        self.weight = weight
+
+    def __call__(self, y_true, y_pred, reduction='sum'):
+        LD = detection_loss(y_true, y_pred)
+        return self.weight * LD, LD
+
+class TotalLoss_SCE(Loss):
+    def __init__(self, weight=[1.0, 1.0, 0.2]):
+        super().__init__()
+        self.weight = weight
+
+    def __call__(self, y_true, y_pred, speech_label, text_label, logit, affinity_matrix, n_speech, reduction='sum'):
+        ctc = ctc_loss(affinity_matrix, speech_label, text_label, n_speech)
+
+        if self.weight[0] != 0.0:
+            LD = detection_loss(y_true, y_pred)
+        else:
+            LD = 0
+        if self.weight[1] != 0.0:
+            LC = sequence_cross_entropy(speech_label, text_label, logit, reduction=reduction)
+        else:
+            LC = 0
+        return self.weight[0] * LD + self.weight[1] * LC + self.weight[2] * ctc, LD, LC
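
For orientation, the shapes the `tf.nn.ctc_loss` call above consumes: the logits become `[B, T, C]` after the transpose (`logits_time_major=False`), the dense zero-padded labels ride with `label_length`, and `n_speech` supplies the valid frame count per utterance. A standalone shape check with made-up sizes:

```python
# Standalone shape check for a tf.nn.ctc_loss call like the one above.
# Sizes are made up: B=2 utterances, T=50 frames, C=10 classes
# (index 0 is the blank), padded label length Lt=6.
import tensorflow as tf

B, T, C = 2, 50, 10
logits = tf.random.normal([B, T, C])        # [B, T, C], time not major
labels = tf.constant([[3, 1, 4, 0, 0, 0],
                      [2, 2, 5, 7, 0, 0]])  # zero-padded targets
label_length = tf.cast(tf.math.count_nonzero(labels, axis=1), tf.int32)  # [3, 4]
logit_length = tf.fill([B], T)              # valid frames per utterance

loss = tf.nn.ctc_loss(labels, logits, label_length, logit_length,
                      logits_time_major=False, blank_index=0)
print(loss.shape)  # (2,) -- one loss value per batch element
```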
criterion/total_ctc1_clap.py ADDED
@@ -0,0 +1,125 @@
+import os, sys
+import tensorflow as tf
+import numpy as np
+from tensorflow.keras.losses import Loss, MeanSquaredError
+
+seed = 42
+tf.random.set_seed(seed)
+np.random.seed(seed)
+
+def sequence_cross_entropy(speech_label, text_label, logits, reduction='sum'):
+    """
+    args
+        speech_label : [B, Ls]
+        text_label : [B, Lt]
+        logits : [B, Lt]
+        logits._keras_mask : [B, Lt]
+    """
+    # Data pre-processing
+    if tf.shape(text_label)[1] > tf.shape(speech_label)[1]:
+        speech_label = tf.pad(speech_label, [[0, 0], [0, tf.shape(text_label)[1] - tf.shape(speech_label)[1]]], 'CONSTANT', constant_values=0)
+    elif tf.shape(text_label)[1] < tf.shape(speech_label)[1]:
+        speech_label = speech_label[:, :text_label.shape[1]]
+
+    # Make paired data between text and speech phonemes
+    paired_label = tf.math.equal(text_label, speech_label)
+    paired_label = tf.cast(tf.math.logical_and(tf.cast(paired_label, tf.bool), tf.cast(logits._keras_mask, tf.bool)), tf.float32)
+    paired_label = tf.reshape(tf.ragged.boolean_mask(paired_label, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
+    logits = tf.reshape(tf.ragged.boolean_mask(logits, tf.cast(logits._keras_mask, tf.bool)).flat_values, [-1, 1])
+
+    # Get BinaryCrossEntropy loss
+    BCE = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    loss = BCE(paired_label, logits)
+
+    if reduction == 'sum':
+        loss = tf.math.divide_no_nan(loss, tf.cast(tf.shape(logits)[0], loss.dtype))
+        loss = tf.math.multiply_no_nan(loss, tf.cast(tf.shape(speech_label)[0], loss.dtype))
+
+    return loss
+
+def matrix_loss_0(y_true, y_pred):
+    MBC_0 = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    return MBC_0(y_true, y_pred)
+
+def matrix_loss_1(y_true, y_pred):
+    MBC_1 = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    return MBC_1(y_true, y_pred)
+
+def detection_loss(y_true, y_pred):
+    BFC = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
+    return BFC(y_true, y_pred)
+
+def ctc_loss(affinity_matrix, speech_labels, text_labels, n_speech):
+    # logit_length
+    # n_speech = tf.math.reduce_sum(tf.cast(affinity_matrix._keras_mask, tf.float32), -1)
+
+    # logit
+    transposed_logits = tf.transpose(affinity_matrix, perm=[0, 2, 1])
+    # log_probs = tf.math.log(transposed_logits + 1e-8)
+    # logits_approx = log_probs - tf.reduce_max(log_probs, axis=-1, keepdims=True)
+
+    # label
+    matches = tf.equal(speech_labels, text_labels)
+    indices = tf.range(text_labels.shape[1], dtype=tf.int32)
+    selected_indices = tf.where(matches, indices, tf.fill(tf.shape(text_labels), 0))
+    labels = tf.where(tf.equal(text_labels, 0), text_labels, selected_indices)
+
+    # label_length
+    label_length = tf.math.count_nonzero(labels, axis=1)
+
+    # mask = tf.not_equal(labels, 0)
+    # # Apply the mask; use tf.ragged.boolean_mask to handle variable-length data
+    # labels = tf.ragged.boolean_mask(labels, mask)
+
+    ctc_loss = tf.nn.ctc_loss(labels, transposed_logits, label_length, n_speech,
+                              logits_time_major=False,
+                              unique=None,
+                              blank_index=0,
+                              name=None)
+
+    return ctc_loss
+
+class TotalLoss(Loss):
+    def __init__(self, weight=1.0):
+        super().__init__()
+        self.weight = weight
+
+    def __call__(self, y_true, y_pred, reduction='sum'):
+        LD = detection_loss(y_true, y_pred)
+        return self.weight * LD, LD
+
+class TotalLoss_SCE(Loss):
+    def __init__(self, weight=[1.0, 1.0, 0.01, 0.01]):
+        super().__init__()
+        self.weight = weight
+
+    def __call__(self, y_true, y_pred, speech_label, text_label, logit, prob, affinity_matrix, n_speech, reduction='sum'):
+        ctc = ctc_loss(affinity_matrix, speech_label, text_label, n_speech)
+
+        # Symmetric cross-entropy over 5x5 similarity blocks
+        number_1 = 5
+        number_2 = int(y_pred.shape[0] // number_1)
+        number_3 = int(y_pred.shape[0] // (number_1 * number_1))
+        y_pred_1 = tf.reshape(prob, [number_2, number_1])
+        y_true_1 = tf.reshape(y_true, [number_2, number_1])
+
+        loss_audio = matrix_loss_0(y_true_1, y_pred_1)
+        x = tf.reshape(prob, [number_3, number_1, number_1])
+        x_transposed = tf.transpose(x, perm=[0, 2, 1])
+        y_pred_2 = tf.reshape(x_transposed, [number_2, number_1])
+        y = tf.reshape(y_true, [number_3, number_1, number_1])
+        y_transposed = tf.transpose(y, perm=[0, 2, 1])
+        y_true_2 = tf.reshape(y_transposed, [number_2, number_1])
+        loss_text = matrix_loss_1(y_true_2, y_pred_2)
+        loss = 0.5 * loss_audio + 0.5 * loss_text
+
+        if self.weight[0] != 0.0:
+            LD = detection_loss(y_true, y_pred)
+        else:
+            LD = 0
+        if self.weight[1] != 0.0:
+            LC = sequence_cross_entropy(speech_label, text_label, logit, reduction=reduction)
+        else:
+            LC = 0
+        return self.weight[0] * LD + self.weight[1] * LC + self.weight[2] * ctc + self.weight[3] * loss, LD, LC
criterion/utils.py ADDED
@@ -0,0 +1,32 @@
+import numpy as np
+import sklearn.metrics
+import tensorflow as tf
+
+def compute_eer(label, pred):
+    # fpr, tpr, fnr, and threshold are all np.array lists
+    fpr, tpr, threshold = sklearn.metrics.roc_curve(label, pred)
+    fnr = 1 - tpr
+
+    # the threshold where fnr == fpr
+    eer_threshold = threshold[np.nanargmin(np.absolute(fnr - fpr))]
+
+    # theoretically the EER from fpr and the EER from fnr should be
+    # identical, but they can differ slightly in practice
+    eer_1 = fpr[np.nanargmin(np.absolute(fnr - fpr))]
+    eer_2 = fnr[np.nanargmin(np.absolute(fnr - fpr))]
+
+    # return the mean of the EER from fpr and from fnr
+    eer = (eer_1 + eer_2) / 2
+    return eer
+
+class eer(tf.keras.metrics.Metric):
+    def __init__(self, name='equal_error_rate', **kwargs):
+        super(eer, self).__init__(name=name, **kwargs)
+        self.score = self.add_weight(name='eer', initializer='zeros')
+        self.count = self.add_weight(name='count', initializer='zeros')
+
+    def update_state(self, y_true, y_pred):
+        self.score.assign_add(tf.reduce_sum(tf.py_function(func=compute_eer, inp=[y_true, y_pred], Tout=tf.float32, name='compute_eer')))
+        self.count.assign_add(1)
+
+    def result(self):
+        return tf.math.divide_no_nan(self.score, self.count)
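
A quick self-contained check of `compute_eer` on invented scores: one positive (0.4) scores below one negative (0.6), so the false-positive and false-negative rates cross at about one third:

```python
# Toy usage of compute_eer; labels and scores are invented.
import numpy as np

labels = np.array([1, 1, 1, 0, 0, 0])
scores = np.array([0.9, 0.8, 0.4, 0.6, 0.3, 0.1])
print(compute_eer(labels, scores))  # ~0.33
```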
dataset/__pycache__/dataloader_demo.cpython-37.pyc ADDED
Binary file (7.73 kB)
 
dataset/__pycache__/dataloader_infe.cpython-37.pyc ADDED
Binary file (6.73 kB)
 
dataset/__pycache__/google.cpython-37.pyc ADDED
Binary file (8.57 kB)
 
dataset/__pycache__/google_infe202405.cpython-37.pyc ADDED
Binary file (8.64 kB)
 
dataset/__pycache__/libriphrase.cpython-37.pyc ADDED
Binary file (13.6 kB)
 
dataset/__pycache__/libriphrase_ctc1.cpython-37.pyc ADDED
Binary file (14.3 kB)
 
dataset/__pycache__/qualcomm.cpython-37.pyc ADDED
Binary file (8.06 kB)
 
dataset/dataloader_demo.py ADDED
@@ -0,0 +1,182 @@
+import math, os, re, sys
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from multiprocessing import Pool
+from scipy.io import wavfile
+import tensorflow as tf
+from pydub import AudioSegment
+from tensorflow.keras.utils import Sequence, OrderedEnqueuer
+from tensorflow.keras import layers
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+sys.path.append(os.path.dirname(__file__))
+from g2p.g2p_en.g2p import G2p
+
+import warnings
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+
+class GoogleCommandsDataloader(Sequence):
+    def __init__(self,
+                 batch_size,
+                 fs=16000,
+                 keyword=['realtek go', 'ok google', 'vintage', 'hackney', 'crocodile', 'surroundings', 'oversaw', 'northwestern'],
+                 wav_path_or_object='/share/nas165/yiting/recording/ok_google/Default_20240725-183008.wav',
+                 features='g2p_embed',  # phoneme, g2p_embed, both ...
+                 ):
+
+        phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                                  'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                                  'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                                  'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                                  'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                                  'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                                  'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                                  ' ']
+
+        self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
+        self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
+
+        self.batch_size = batch_size
+        self.fs = fs
+        self.features = features
+        self.nPhoneme = len(phonemes)
+        self.g2p = G2p()
+        self.keyword = keyword
+        self.wav = wav_path_or_object
+        self.__prep__()
+        self.on_epoch_end()
+
+    def __prep__(self):
+        self.data = pd.DataFrame(columns=['wav', 'text', 'duration', 'label'])
+        anchor = ' '
+        target_dict = {}
+        if isinstance(self.wav, str):
+            anchor = self.wav.split('/')[-2].lower().replace('_', ' ')
+            duration = float(wavfile.read(self.wav)[1].shape[-1]) / self.fs
+        else:
+            duration = float(self.wav[1].shape[-1]) / self.fs
+
+        for i, comparison_text in enumerate(self.keyword):
+            label = 1 if comparison_text == anchor else 0
+            target_dict[i] = {
+                'wav': self.wav,
+                'text': comparison_text,
+                'duration': duration,
+                'label': label
+            }
+
+        print(target_dict)
+        self.data = self.data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
+        print(self.data)
+        # g2p & p2idx by g2p_en package
+        print(">> Convert word to phoneme")
+        self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+        print(">> Convert phoneme to index")
+        self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
+        print(">> Compute phoneme embedding")
+        self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
+
+        # Get longest data
+        self.wav_list = self.data['wav'].values
+        self.idx_list = self.data['pIndex'].values
+        self.emb_list = self.data['g2p_embed'].values
+        self.lab_list = self.data['label'].values
+        self.data = self.data.sort_values(by='duration').reset_index(drop=True)
+
+        # Set dataloader params.
+        self.len = len(self.data)
+        self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+        self.maxlen_a = 56000
+
+    def __len__(self):
+        # return total batch-wise length
+        return math.ceil(self.len / self.batch_size)
+
+    def _load_wav(self, wav):
+        return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
+
+    def __getitem__(self, idx):
+        # chunking
+        indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
+
+        # load inputs
+        if isinstance(self.wav, str):
+            batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
+        else:
+            batch_x = [np.array((self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
+        if self.features == 'both':
+            batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        else:
+            if self.features == 'phoneme':
+                batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
+            elif self.features == 'g2p_embed':
+                batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
+        # load outputs
+        batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
+
+        # padding and masking
+        pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
+        if self.features == 'both':
+            pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
+            pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
+        else:
+            pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
+        pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
+
+        if self.features == 'both':
+            return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+        else:
+            return pad_batch_x, pad_batch_y, pad_batch_z
+
+    def on_epoch_end(self):
+        self.indices = np.arange(self.len)
+        # if self.shuffle == True:
+        #     np.random.shuffle(self.indices)
+
+def convert_sequence_to_dataset(dataloader):
+    def data_generator():
+        for i in range(dataloader.__len__()):
+            if dataloader.features == 'both':
+                pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
+                yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
+            else:
+                pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
+                yield pad_batch_x, pad_batch_y, pad_batch_z
+
+    if dataloader.features == 'both':
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+        )
+    else:
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
+                          dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
+        )
+    # data_dataset = data_dataset.cache()
+    data_dataset = data_dataset.prefetch(1)
+
+    return data_dataset
+
+if __name__ == '__main__':
+    dataloader = GoogleCommandsDataloader(2048, features='g2p_embed')
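
The batching logic above hinges on `pad_sequences` zero-padding variable-length phoneme index lists to `maxlen_t` (and waveforms to `maxlen_a = 56000` samples). A toy trace of the phoneme branch (index values invented):

```python
# Sketch of the padding step in __getitem__; index values are invented.
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

batch_p = [np.array([42, 5, 33], dtype=np.int32),   # keyword 1
           np.array([7, 19], dtype=np.int32)]       # keyword 2
pad_batch_p = pad_sequences(batch_p, maxlen=10, value=0.0,
                            padding='post', dtype='int32')
print(pad_batch_p)
# [[42  5 33  0  0  0  0  0  0  0]
#  [ 7 19  0  0  0  0  0  0  0  0]]
```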
dataset/dataloader_infe.py ADDED
@@ -0,0 +1,164 @@
+import math, os, re, sys
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from multiprocessing import Pool
+from scipy.io import wavfile
+import tensorflow as tf
+
+from tensorflow.keras.utils import Sequence, OrderedEnqueuer
+from tensorflow.keras import layers
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+sys.path.append(os.path.dirname(__file__))
+from g2p.g2p_en.g2p import G2p
+
+import warnings
+warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
+np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+
+def dataloader(fs=16000, keyword='', wav_path_or_object=None, g2p=None,
+               features='both'  # phoneme, g2p_embed, both ...
+               ):
+
+    phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+                              'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
+                              'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+                              'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
+                              'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
+                              'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
+                              'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
+                              ' ']
+
+    p2idx = {p: idx for idx, p in enumerate(phonemes)}
+    idx2p = {idx: p for idx, p in enumerate(phonemes)}
+
+    # g2p = G2p()
+
+    data = pd.DataFrame(columns=['wav', 'wav_label', 'text', 'duration', 'label'])
+
+    target_dict = {}
+    idx = 0
+
+    wav = wav_path_or_object
+    if isinstance(wav_path_or_object, str):
+        duration = float(wavfile.read(wav)[1].shape[-1]) / fs
+    else:
+        duration = float(wav_path_or_object.shape[-1]) / fs
+    label = 1
+    anchor_text = wav.split('/')[-2].lower()
+    target_dict[idx] = {
+        'wav': wav,
+        'wav_label': anchor_text,
+        'text': keyword,
+        'duration': duration,
+        'label': label
+    }
+    data = data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
+
+    # g2p & p2idx by g2p_en package
+    data['phoneme'] = data['text'].apply(lambda x: g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+    data['pIndex'] = data['phoneme'].apply(lambda x: [p2idx[t] for t in x])
+    data['g2p_embed'] = data['text'].apply(lambda x: g2p.embedding(x))
+    data['wav_phoneme'] = data['wav_label'].apply(lambda x: g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
+    data['wav_pIndex'] = data['wav_phoneme'].apply(lambda x: [p2idx[t] for t in x])
+
+    # Get longest data
+    data = data.sort_values(by='duration').reset_index(drop=True)
+    wav_list = data['wav'].values
+    idx_list = data['pIndex'].values
+    emb_list = data['g2p_embed'].values
+    lab_list = data['label'].values
+    sIdx_list = data['wav_pIndex'].values
+
+    # Set dataloader params.
+    maxlen_t = int((int(data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+    maxlen_a = int((int(data['duration'].values[-1] / 0.5) + 1) * fs / 2)
+    maxlen_l = int((int(data['wav_label'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
+    indices = [0]
+
+    # load inputs
+    if isinstance(wav_path_or_object, str):
+        batch_x = [np.array(wavfile.read(wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
+    else:
+        batch_x = [wav_list[i] / 32768.0 for i in indices]
+    if features == 'both':
+        batch_p = [np.array(idx_list[i]).astype(np.int32) for i in indices]
+        batch_e = [np.array(emb_list[i]).astype(np.float32) for i in indices]
+    else:
+        if features == 'phoneme':
+            batch_y = [np.array(idx_list[i]).astype(np.int32) for i in indices]
+        elif features == 'g2p_embed':
+            batch_y = [np.array(emb_list[i]).astype(np.float32) for i in indices]
+    # load outputs
+    batch_z = [np.array([lab_list[i]]).astype(np.float32) for i in indices]
+    batch_l = [np.array(sIdx_list[i]).astype(np.int32) for i in indices]
+
+    # padding and masking
+    pad_batch_x = pad_sequences(np.array(batch_x), maxlen=maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
+    if features == 'both':
+        pad_batch_p = pad_sequences(np.array(batch_p), maxlen=maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
+        pad_batch_e = pad_sequences(np.array(batch_e), maxlen=maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
+    else:
+        pad_batch_y = pad_sequences(np.array(batch_y), maxlen=maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
+    pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
+    pad_batch_l = pad_sequences(np.array(batch_l), maxlen=maxlen_l, value=0.0, padding='post', dtype=batch_l[0].dtype)
+
+    if features == 'both':
+        return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, batch_l
+    else:
+        return pad_batch_x, pad_batch_y, pad_batch_z, batch_l
+
+# def _load_wav(self, wav):
+#     return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
+
+def convert_sequence_to_dataset(dataloader, wav, text, features):
+    fs = 16000
+    duration = float(wavfile.read(wav)[1].shape[-1]) / fs
+    maxlen_t = int((int(len(text) / 10) + 1) * 10)
+    maxlen_a = int((int(duration / 0.5) + 1) * fs / 2)
+    wav_label = wav.split('/')[-2].lower()
+
+    def data_generator():
+        if features == 'both':
+            pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l = dataloader
+            yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l
+        else:
+            pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_l = dataloader
+            yield pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_l
+
+    if features == 'both':
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, maxlen_t), dtype=tf.int32),
+            tf.TensorSpec(shape=(None, maxlen_t, 256), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, None), dtype=tf.int32),)
+        )
+    else:
+        data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
+            tf.TensorSpec(shape=(None, maxlen_a), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, maxlen_t) if features == 'phoneme' else (None, maxlen_t, 256),
+                          dtype=tf.int32 if features == 'phoneme' else tf.float32),
+            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
+            tf.TensorSpec(shape=(None, None), dtype=tf.int32),)
+        )
+    # data_dataset = data_dataset.cache()
+    data_dataset = data_dataset.prefetch(1)
+
+    return data_dataset
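
The `maxlen_a` arithmetic above rounds the clip duration up to the next half-second and converts it to samples; a quick standalone check with a made-up duration:

```python
# Standalone check of the maxlen_a arithmetic; duration is made up.
fs = 16000
duration = 2.3  # seconds
maxlen_a = int((int(duration / 0.5) + 1) * fs / 2)
print(maxlen_a)  # 40000 samples, i.e. 2.5 s at 16 kHz
```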
dataset/g2p/LICENSE.txt ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
dataset/g2p/g2p_en/__init__.py ADDED
@@ -0,0 +1 @@
+from .g2p import G2p
dataset/g2p/g2p_en/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (186 Bytes)
 
dataset/g2p/g2p_en/__pycache__/expand.cpython-37.pyc ADDED
Binary file (2.39 kB)
 
dataset/g2p/g2p_en/__pycache__/g2p.cpython-37.pyc ADDED
Binary file (8.05 kB)
 
dataset/g2p/g2p_en/checkpoint20.npz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8af35e4596d8dd5836dfd3fe9b2ba4f97b9c311efe8879544cbcfcbd566d8c6
+size 3342298
dataset/g2p/g2p_en/expand.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ #/usr/bin/python2
3
+ '''
4
+ Borrowed
5
+ from https://github.com/keithito/tacotron/blob/master/text/numbers.py
6
+ By kyubyong park. [email protected].
7
+ https://www.github.com/kyubyong/g2p
8
+ '''
9
+ from __future__ import print_function
10
+ import inflect
11
+ import re
12
+
13
+
14
+
15
+ _inflect = inflect.engine()
16
+ _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
17
+ _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
18
+ _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
19
+ _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
20
+ _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
21
+ _number_re = re.compile(r'[0-9]+')
22
+
23
+
24
+ def _remove_commas(m):
25
+ return m.group(1).replace(',', '')
26
+
27
+
28
+ def _expand_decimal_point(m):
29
+ return m.group(1).replace('.', ' point ')
30
+
31
+
32
+ def _expand_dollars(m):
33
+ match = m.group(1)
34
+ parts = match.split('.')
35
+ if len(parts) > 2:
36
+ return match + ' dollars' # Unexpected format
37
+ dollars = int(parts[0]) if parts[0] else 0
38
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
39
+ if dollars and cents:
40
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
41
+ cent_unit = 'cent' if cents == 1 else 'cents'
42
+ return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
43
+ elif dollars:
44
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
45
+ return '%s %s' % (dollars, dollar_unit)
46
+ elif cents:
47
+ cent_unit = 'cent' if cents == 1 else 'cents'
48
+ return '%s %s' % (cents, cent_unit)
49
+ else:
50
+ return 'zero dollars'
51
+
52
+
53
+ def _expand_ordinal(m):
54
+ return _inflect.number_to_words(m.group(0))
55
+
56
+
57
+ def _expand_number(m):
58
+ num = int(m.group(0))
59
+ if num > 1000 and num < 3000:
60
+ if num == 2000:
61
+ return 'two thousand'
62
+ elif num > 2000 and num < 2010:
63
+ return 'two thousand ' + _inflect.number_to_words(num % 100)
64
+ elif num % 100 == 0:
65
+ return _inflect.number_to_words(num // 100) + ' hundred'
66
+ else:
67
+ return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
68
+ else:
69
+ return _inflect.number_to_words(num, andword='')
70
+
71
+
72
+ def normalize_numbers(text):
73
+ text = re.sub(_comma_number_re, _remove_commas, text)
74
+ text = re.sub(_pounds_re, r'\1 pounds', text)
75
+ text = re.sub(_dollars_re, _expand_dollars, text)
76
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
77
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
78
+ text = re.sub(_number_re, _expand_number, text)
79
+ return text
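A minimal usage sketch for normalize_numbers (illustrative only; the expected outputs are inferred from the regex rules above, not taken from the repository):

    # Hypothetical usage sketch -- not part of the committed file.
    from expand import normalize_numbers

    print(normalize_numbers("I have $250 in my pocket."))  # I have two hundred fifty dollars in my pocket.
    print(normalize_numbers("born in 2008"))               # born in two thousand eight
    print(normalize_numbers("the 3rd of 1,000 entries"))   # the third of one thousand entries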
dataset/g2p/g2p_en/g2p.py ADDED
@@ -0,0 +1,249 @@
1
+ # -*- coding: utf-8 -*-
2
+ # /usr/bin/python
3
+ '''
4
+ By kyubyong park([email protected]) and Jongseok Kim(https://github.com/ozmig77)
5
+ https://www.github.com/kyubyong/g2p
6
+ '''
7
+ from nltk import pos_tag
8
+ from nltk.corpus import cmudict
9
+ import nltk
10
+ from nltk.tokenize import TweetTokenizer
11
+ word_tokenize = TweetTokenizer().tokenize
12
+ import numpy as np
13
+ import codecs
14
+ import re
15
+ import os, sys
16
+ import unicodedata
17
+ from builtins import str as unicode
18
+
19
+ sys.path.append(os.path.dirname(__file__))
20
+ from expand import normalize_numbers
21
+
22
+ try:
23
+ nltk.data.find('taggers/averaged_perceptron_tagger.zip')
24
+ except LookupError:
25
+ nltk.download('averaged_perceptron_tagger')
26
+ try:
27
+ nltk.data.find('corpora/cmudict.zip')
28
+ except LookupError:
29
+ nltk.download('cmudict')
30
+
31
+ dirname = os.path.dirname(__file__)
32
+
33
+ def construct_homograph_dictionary():
34
+ f = os.path.join(dirname,'homographs.en')
35
+ homograph2features = dict()
36
+ for line in codecs.open(f, 'r', 'utf8').read().splitlines():
37
+ if line.startswith("#"): continue # comment
38
+ headword, pron1, pron2, pos1 = line.strip().split("|")
39
+ homograph2features[headword.lower()] = (pron1.split(), pron2.split(), pos1)
40
+ return homograph2features
41
+
42
+ # def segment(text):
43
+ # '''
44
+ # Splits text into `tokens`.
45
+ # :param text: A string.
46
+ # :return: A list of tokens (string).
47
+ # '''
48
+ # print(text)
49
+ # text = re.sub('([.,?!]( |$))', r' \1', text)
50
+ # print(text)
51
+ # return text.split()
52
+
53
+ class G2p(object):
54
+ def __init__(self):
55
+ super().__init__()
56
+ self.graphemes = ["<pad>", "<unk>", "</s>"] + list("abcdefghijklmnopqrstuvwxyz")
57
+ self.phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
58
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
59
+ 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
60
+ 'EY2', 'F', 'G', 'HH',
61
+ 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
62
+ 'M', 'N', 'NG', 'OW0', 'OW1',
63
+ 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
64
+ 'UH0', 'UH1', 'UH2', 'UW',
65
+ 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
66
+ self.g2idx = {g: idx for idx, g in enumerate(self.graphemes)}
67
+ self.idx2g = {idx: g for idx, g in enumerate(self.graphemes)}
68
+
69
+ self.p2idx = {p: idx for idx, p in enumerate(self.phonemes)}
70
+ self.idx2p = {idx: p for idx, p in enumerate(self.phonemes)}
71
+
72
+ self.cmu = cmudict.dict()
73
+ self.load_variables()
74
+ self.homograph2features = construct_homograph_dictionary()
75
+
76
+ def load_variables(self):
77
+ self.variables = np.load(os.path.join(dirname,'checkpoint20.npz'))
78
+ self.enc_emb = self.variables["enc_emb"] # (29, 64). (len(graphemes), emb)
79
+ self.enc_w_ih = self.variables["enc_w_ih"] # (3*128, 64)
80
+ self.enc_w_hh = self.variables["enc_w_hh"] # (3*128, 128)
81
+ self.enc_b_ih = self.variables["enc_b_ih"] # (3*128,)
82
+ self.enc_b_hh = self.variables["enc_b_hh"] # (3*128,)
83
+
84
+ self.dec_emb = self.variables["dec_emb"] # (74, 64). (len(phonemes), emb)
85
+ self.dec_w_ih = self.variables["dec_w_ih"] # (3*128, 64)
86
+ self.dec_w_hh = self.variables["dec_w_hh"] # (3*128, 128)
87
+ self.dec_b_ih = self.variables["dec_b_ih"] # (3*128,)
88
+ self.dec_b_hh = self.variables["dec_b_hh"] # (3*128,)
89
+ self.fc_w = self.variables["fc_w"] # (74, 128)
90
+ self.fc_b = self.variables["fc_b"] # (74,)
91
+
92
+ def sigmoid(self, x):
93
+ return 1 / (1 + np.exp(-x))
94
+
95
+ def grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
96
+ rzn_ih = np.matmul(x, w_ih.T) + b_ih
97
+ rzn_hh = np.matmul(h, w_hh.T) + b_hh
98
+
99
+ rz_ih, n_ih = rzn_ih[:, :rzn_ih.shape[-1] * 2 // 3], rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:]
100
+ rz_hh, n_hh = rzn_hh[:, :rzn_hh.shape[-1] * 2 // 3], rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:]
101
+
102
+ rz = self.sigmoid(rz_ih + rz_hh)
103
+ r, z = np.split(rz, 2, -1)
104
+
105
+ n = np.tanh(n_ih + r * n_hh)
106
+ h = (1 - z) * n + z * h
107
+
108
+ return h
109
+
110
+ def gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None):
111
+ if h0 is None:
112
+ h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
113
+ h = h0 # initial hidden state
114
+ outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
115
+ for t in range(steps):
116
+ h = self.grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh) # (b, h)
117
+ outputs[:, t, ::] = h
118
+ return outputs
119
+
120
+ def encode(self, word):
121
+ chars = list(word) + ["</s>"]
122
+ x = [self.g2idx.get(char, self.g2idx["<unk>"]) for char in chars]
123
+ x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)
124
+
125
+ return x
126
+
127
+ def predict(self, word):
128
+ # encoder
129
+ enc = self.encode(word)
130
+ enc = self.gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh,
131
+ self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32))
132
+ last_hidden = enc[:, -1, :]
133
+
134
+ # decoder
135
+ dec = np.take(self.dec_emb, [2], axis=0) # 2: <s>
136
+ h = last_hidden
137
+
138
+ preds = []
139
+ for i in range(20):
140
+ h = self.grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh) # (b, h)
141
+ logits = np.matmul(h, self.fc_w.T) + self.fc_b
142
+ pred = logits.argmax()
143
+ if pred == 3: break # 3: </s>
144
+ preds.append(pred)
145
+ dec = np.take(self.dec_emb, [pred], axis=0)
146
+
147
+ preds = [self.idx2p.get(idx, "<unk>") for idx in preds]
148
+
149
+ return preds
150
+
151
+ def __call__(self, text):
152
+ # preprocessing
153
+ text = unicode(text)
154
+ text = normalize_numbers(text)
155
+ text = ''.join(char for char in unicodedata.normalize('NFD', text)
156
+ if unicodedata.category(char) != 'Mn') # Strip accents
157
+ text = text.lower()
158
+ text = text.replace("_", " ")
159
+ text = re.sub(r"[^ a-z'.,?!\-]", "", text)
160
+ text = text.replace("i.e.", "that is")
161
+ text = text.replace("e.g.", "for example")
162
+
163
+ # tokenization
164
+ words = word_tokenize(text)
165
+ tokens = pos_tag(words) # tuples of (word, tag)
166
+
167
+ # steps
168
+ prons = []
169
+ for word in words:
170
+ if re.search("[a-z]", word) is None:
171
+ continue
172
+
173
+ # elif word in self.homograph2features: # Check homograph
174
+ # pron1, pron2, pos1 = self.homograph2features[word]
175
+ # if pos.startswith(pos1):
176
+ # pron = pron1
177
+ # else:
178
+ # pron = pron2
179
+ # elif word in self.cmu: # lookup CMU dict
180
+ # pron = self.cmu[word][0]
181
+ # else: # predict for oov
182
+
183
+ pron = self.predict(word)
184
+
185
+ prons.extend(pron)
186
+ prons.extend([" "])
187
+
188
+ return prons[:-1]
189
+
190
+ def embedding(self, text):
191
+ # preprocessing
192
+ text = unicode(text)
193
+ text = normalize_numbers(text)
194
+ text = ''.join(char for char in unicodedata.normalize('NFD', text)
195
+ if unicodedata.category(char) != 'Mn') # Strip accents
196
+ text = text.lower()
197
+ text = re.sub(r"[^ a-z'.,?!\-]", "", text)
198
+ text = text.replace("i.e.", "that is")
199
+ text = text.replace("e.g.", "for example")
200
+
201
+ # tokenization
202
+ words = word_tokenize(text)
203
+
204
+ # embedding func.
205
+ def _get(self, word):
206
+ # encoder
207
+ enc = self.encode(word)
208
+ enc = self.gru(enc, len(word) + 1, self.enc_w_ih, self.enc_w_hh,
209
+ self.enc_b_ih, self.enc_b_hh, h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32))
210
+ last_hidden = enc[:, -1, :]
211
+
212
+ # decoder
213
+ dec = np.take(self.dec_emb, [2], axis=0) # 2: <s>
214
+ h = last_hidden
215
+
216
+ preds = []
217
+ emb = np.empty((0, self.dec_emb[0,:].shape[-1]))
218
+ for i in range(20):
219
+ h = self.grucell(dec, h, self.dec_w_ih, self.dec_w_hh, self.dec_b_ih, self.dec_b_hh) # (b, h)
220
+ logits = np.matmul(h, self.fc_w.T) + self.fc_b
221
+ pred = logits.argmax()
222
+ if pred == 3: break # 3: </s>
223
+ dec = np.take(self.dec_emb, [pred], axis=0)
224
+ emb = np.append(emb, h, axis=0)
225
+
226
+ return emb
227
+
228
+ # steps
229
+ embed = np.empty((0, self.dec_emb[0,:].shape[-1]))
230
+ for word in words:
231
+ if re.search("[a-z]", word) is None:
232
+ continue
233
+ embed = np.append(embed, _get(self, word), axis=0)
234
+ embed = np.append(embed, np.take(self.dec_emb, [0], axis=0), axis=0)
235
+
236
+ return embed[:-1,:]
237
+
238
+ if __name__ == '__main__':
239
+ texts = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'hey_android', 'hey_snapdragon', 'hi_galaxy', 'hi_lumina']
240
+ # "I have $250 in my pocket.", # number -> spell-out
241
+ # "popular pets, e.g. cats and dogs", # e.g. -> for example
242
+ # "I refuse to collect the refuse around here.", # homograph
243
+ # "I'm an activationist."] # newly coined word
244
+ g2p = G2p()
245
+ for text in texts:
246
+ out = g2p(text)
247
+ emb = g2p.embedding(text)
248
+ print(out)
249
+ print(emb.shape)
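For reference, grucell above is a plain-NumPy GRU step: the reset/update gates are r, z = sigmoid(x W_ih^T + b_ih + h W_hh^T + b_hh) (the first two thirds of the gate rows), the candidate is n = tanh(n_ih + r * n_hh), and the new state is h' = (1 - z) * n + z * h. A minimal usage sketch for the class (output values are illustrative; note that every word goes through predict(), since the homograph and CMU-dictionary branches are commented out):

    # Hypothetical usage sketch -- not part of the committed file.
    from g2p import G2p

    g2p = G2p()
    print(g2p("hey snapdragon"))           # e.g. ['HH', 'EY1', ' ', 'S', 'N', 'AE1', 'P', ...]
    emb = g2p.embedding("hey snapdragon")
    print(emb.shape)                       # (steps, D): one decoder hidden state per emitted phoneme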
dataset/g2p/g2p_en/homographs.en ADDED
@@ -0,0 +1,379 @@
1
+ #This is based on http://www.minpairs.talktalk.net/graph.html
2
+ #Each line is formatted as follows:
3
+ #HEADWORD|PRONUNCIATION1|PRONUNCIATION2|POS
4
+ #HEADWORD should have PRONUNCIATION1 only if its part-of-speech is POS
5
+ #Otherwise PRONUNCIATION2 is applied
6
+ #May, 2018
7
+ #Kyubyong Park
8
+ #https://github.com/kyubyong/g2p
9
+ ABSENT|AH1 B S AE1 N T|AE1 B S AH0 N T|V
10
+ ABSTRACT|AE0 B S T R AE1 K T|AE1 B S T R AE2 K T|V
11
+ ABSTRACTS|AE0 B S T R AE1 K T S|AE1 B S T R AE0 K T S|V
12
+ ABUSE|AH0 B Y UW1 Z|AH0 B Y UW1 S|V
13
+ ABUSES|AH0 B Y UW1 Z IH0 Z|AH0 B Y UW1 S IH0 Z|V
14
+ ACCENT|AH0 K S EH1 N T|AE1 K S EH2 N T|V
15
+ ACCENTS|AE1 K S EH0 N T S|AE1 K S EH0 N T S|V
16
+ ADDICT|AH0 D IH1 K T|AE1 D IH2 K T|V
17
+ ADDICTS|AH0 D IH1 K T S|AE1 D IH2 K T S|V
18
+ ADVOCATE|AE1 D V AH0 K EY2 T|AE1 D V AH0 K AH0 T|V
19
+ ADVOCATES|AE1 D V AH0 K EY2 T S|AE1 D V AH0 K AH0 T S|V
20
+ AFFECT|AH0 F EH1 K T|AE1 F EH0 K T|V
21
+ AFFECTS|AH0 F EH1 K T S|AE1 F EH0 K T S|V
22
+ AFFIX|AH0 F IH1 K S|AE1 F IH0 K S|V
23
+ AFFIXES|AH0 F IH1 K S IH0 Z|AE1 F IH0 K S IH0 Z|V
24
+ AGGLOMERATE|AH0 G L AA1 M ER0 EY2 T|AH0 G L AA1 M ER0 AH0 T|V
25
+ AGGREGATE|AE1 G R AH0 G EY0 T|AE1 G R AH0 G AH0 T|V
26
+ AGGREGATES|AE1 G R AH0 G EY2 T S|AE1 G R AH0 G IH0 T S|V
27
+ ALLIES|AH0 L AY1 Z|AE1 L AY0 Z|V
28
+ ALLOY|AH0 L OY1|AE1 L OY2|V
29
+ ALLOYS|AH0 L OY1 Z|AE1 L OY2 Z|V
30
+ ALLY|AH0 L AY1|AE1 L AY0|V
31
+ ALTERNATE|AO1 L T ER0 N EY2 T|AO0 L T ER1 N AH0 T|V
32
+ ANALYSES|AH0 N AE1 L IH0 S IY2 Z|AE1 N AH0 L AY0 Z IH2 Z|V
33
+ ANIMATE|AE1 N AH0 M EY2 T|AE1 N AH0 M AH0 T|V
34
+ ANNEX|AH0 N EH1 K S|AE1 N EH2 K S|V
35
+ ANNEXES|AH0 N EH1 K S IH0 Z|AE1 N EH2 K S IH0 Z|V
36
+ APPROPRIATE|AH0 P R OW1 P R IY0 EY2 T|AH0 P R OW1 P R IY0 AH0 T|V
37
+ APPROXIMATE|AH0 P R AA1 K S AH0 M EY2 T|AH0 P R AA1 K S AH0 M AH0 T|V
38
+ ARTICULATE|AA0 R T IH1 K Y AH0 L AH0 T|AA0 R T IH1 K Y AH0 L EY2 T|V
39
+ ASPIRATE|AE1 S P ER0 EY2 T|AE1 S P ER0 AH0 T|V
40
+ ASPIRATES|AE1 S P ER0 EY2 T S|AE1 S P ER0 AH0 T S|V
41
+ ASSOCIATE|AH0 S OW1 S IY0 EY2 T|AH0 S OW1 S IY0 AH0 T|V
42
+ ASSOCIATES|AH0 S OW1 S IY0 EY2 T S|AH0 S OW1 S IY0 AH0 T S|V
43
+ ATTRIBUTE|AH0 T R IH1 B Y UW2 T|AE1 T R IH0 B Y UW0 T|V
44
+ ATTRIBUTES|AH0 T R IH1 B Y UW2 T S|AE1 T R IH0 B Y UW0 T S|V
45
+ BATHS|B AE1 TH S|B AE1 DH Z|V
46
+ BLESSED|B L EH1 S IH0 D|B L EH1 S T|V
47
+ CERTIFICATE|S ER0 T IH1 F IH0 K AH0 T|S ER0 T IH1 F IH0 K EY2 T|V
48
+ CERTIFICATES|S ER0 T IH1 F IH0 K EY2 T S|S ER0 T IH1 F IH0 K AH0 T S|V
49
+ CLOSE|K L OW1 Z|K L OW1 S|V
50
+ CLOSER|K L OW1 Z ER0|K L OW1 S ER0|N
51
+ CLOSES|K L OW1 Z IH0 Z|K L OW1 S IH0 Z|V
52
+ COLLECT|K AH0 L EH1 K T|K AA1 L EH0 K T|V
53
+ COLLECTS|K AH0 L EH1 K T S|K AA1 L EH0 K T S|V
54
+ COMBAT|K AH0 M B AE1 T|K AA1 M B AE0 T|V
55
+ COMBATS|K AH0 M B AE1 T S|K AH1 M B AE0 T S|V
56
+ COMBINE|K AH0 M B AY1 N|K AA1 M B AY0 N|V
57
+ COMMUNE|K AH0 M Y UW1 N|K AA1 M Y UW0 N|V
58
+ COMMUNES|K AH0 M Y UW1 N Z|K AA1 M Y UW0 N Z|V
59
+ COMPACT|K AH0 M P AE1 K T|K AA1 M P AE0 K T|V
60
+ COMPACTS|K AH0 M P AE1 K T S|K AA1 M P AE0 K T S|V
61
+ COMPLEX|K AH0 M P L EH1 K S| K AA1 M P L EH0 K S|ADJ
62
+ COMPLIMENT|K AA1 M P L AH0 M EH0 N T|K AA1 M P L AH0 M AH0 N T|V
63
+ COMPLIMENTS|K AA1 M P L AH0 M EH0 N T S|K AA1 M P L AH0 M AH0 N T S|V
64
+ COMPOUND|K AH0 M P AW1 N D|K AA1 M P AW0 N D|V
65
+ COMPOUNDS|K AH0 M P AW1 N D Z|K AA1 M P AW0 N D Z|V
66
+ COMPRESS|K AH0 M P R EH1 S|K AA1 M P R EH0 S|V
67
+ COMPRESSES|K AH0 M P R EH1 S IH0 Z|K AA1 M P R EH0 S AH0 Z|V
68
+ CONCERT|K AH0 N S ER1 T|K AA1 N S ER0 T|V
69
+ CONCERTS|K AH0 N S ER1 T S|K AA1 N S ER0 T S|V
70
+ CONDUCT|K AA0 N D AH1 K T|K AA1 N D AH0 K T|V
71
+ CONFEDERATE|K AH0 N F EH1 D ER0 EY2 T|K AH0 N F EH1 D ER0 AH0 T|V
72
+ CONFEDERATES|K AH0 N F EH1 D ER0 EY2 T S|K AH0 N F EH1 D ER0 AH0 T S|V
73
+ CONFINES|K AH0 N F AY1 N Z|K AA1 N F AY2 N Z|V
74
+ CONFLICT|K AH0 N F L IH1 K T|K AA1 N F L IH0 K T|V
75
+ CONFLICTS|K AH0 N F L IH1 K T S|K AA1 N F L IH0 K T S|V
76
+ CONGLOMERATE|K AH0 N G L AA1 M ER0 EY2 T|K AH0 N G L AA1 M ER0 AH0 T|V
77
+ CONGLOMERATES|K AH0 N G L AA1 M ER0 EY2 T S|K AH0 N G L AA1 M ER0 AH0 T S|V
78
+ CONSCRIPT|K AH0 N S K R IH1 P T|K AA1 N S K R IH0 P T|V
79
+ CONSCRIPTS|K AH0 N S K R IH1 P T S|K AA1 N S K R IH0 P T S|V
80
+ CONSOLE|K AH0 N S OW1 L|K AA1 N S OW0 L|V
81
+ CONSOLES|K AH0 N S OW1 L Z|K AA1 N S OW0 L Z|V
82
+ CONSORT|K AH0 N S AO1 R T|K AA1 N S AO0 R T|V
83
+ CONSTRUCT|K AH0 N S T R AH1 K T|K AA1 N S T R AH0 K T|V
84
+ CONSTRUCTS|K AH0 N S T R AH1 K T S|K AA1 N S T R AH0 K T S|V
85
+ CONSUMMATE|K AA1 N S AH0 M EY2 T|K AA0 N S AH1 M AH0 T|V
86
+ CONTENT|K AA1 N T EH0 N T|K AH0 N T EH1 N T|N
87
+ CONTENTS|K AH0 N T EH1 N T S|K AA1 N T EH0 N T S|V
88
+ CONTEST|K AH0 N T EH1 S T|K AA1 N T EH0 S T|V
89
+ CONTESTS|K AH0 N T EH1 S T S|K AA1 N T EH0 S T S|V
90
+ CONTRACT|K AH0 N T R AE1 K T|K AA1 N T R AE2 K T|V
91
+ CONTRACTS|K AH0 N T R AE1 K T S|K AA1 N T R AE2 K T S|V
92
+ CONTRAST|K AH0 N T R AE1 S T|K AA1 N T R AE0 S T|V
93
+ CONTRASTS|K AH0 N T R AE1 S T S|K AA1 N T R AE0 S T S|V
94
+ CONVERSE|K AH0 N V ER1 S|K AA1 N V ER0 S|V
95
+ CONVERT|K AH0 N V ER1 T|K AA1 N V ER0 T|V
96
+ CONVERTS|K AH0 N V ER1 T S|K AA1 N V ER0 T S|V
97
+ CONVICT|K AH0 N V IH1 K T|K AA1 N V IH0 K T|V
98
+ CONVICTS|K AH0 N V IH1 K T S|K AA1 N V IH0 K T S|V
99
+ COORDINATE|K OW0 AO1 R D AH0 N EY2 T|K OW0 AO1 R D AH0 N AH0 T|V
100
+ COORDINATES|K OW0 AO1 R D AH0 N EY2 T S|K OW0 AO1 R D AH0 N AH0 T S|V
101
+ COUNTERBALANCE|K AW1 N T ER0 B AE2 L AH0 N S|K AW2 N T ER0 B AE1 L AH0 N S|V
102
+ COUNTERBALANCES|K AW2 N T ER0 B AE1 L AH0 N S IH0 Z|K AW1 N T ER0 B AE2 L AH0 N S IH0 Z|V
103
+ CRABBED|K R AE1 B D|K R AE1 B IH0 D|V
104
+ CROOKED|K R UH1 K T|K R UH1 K AH0 D|V
105
+ CURATE|K Y UH0 R AH1 T|K Y UH1 R AH0 T|V
106
+ CURSED|K ER1 S T|K ER1 S IH0 D|V
107
+ DECOY|D IY0 K OY1|D IY1 K OY0|V
108
+ DECOYS|D IY0 K OY1 Z|D IY1 K OY0 Z|V
109
+ DECREASE|D IH0 K R IY1 S|D IY1 K R IY2 S|V
110
+ DECREASES|D IH0 K R IY1 S IH0 Z|D IY1 K R IY2 S IH0 Z|V
111
+ DEFECT|D IH0 F EH1 K T|D IY1 F EH0 K T|V
112
+ DEFECTS|D IH0 F EH1 K T S|D IY1 F EH0 K T S|V
113
+ DEGENERATE|D IH0 JH EH1 N ER0 EY2 T|D IH0 JH EH1 N ER0 AH0 T|V
114
+ DEGENERATES|D IH0 JH EH1 N ER0 EY2 T S|D IH0 JH EH1 N ER0 AH0 T S|V
115
+ DELEGATE|D EH1 L AH0 G EY2 T|D EH1 L AH0 G AH0 T|V
116
+ DELEGATES|D EH1 L AH0 G EY2 T S|D EH1 L AH0 G AH0 T S|V
117
+ DELIBERATE|D IH0 L IH1 B ER0 EY2 T|D IH0 L IH1 B ER0 AH0 T|V
118
+ DESERT|D IH0 Z ER1 T|D EH1 Z ER0 T|V
119
+ DESERTS|D IH0 Z ER1 T S|D EH1 Z ER0 T S|V
120
+ DESOLATE|D EH1 S AH0 L EY2 T|D EH1 S AH0 L AH0 T|V
121
+ DIAGNOSES|D AY1 AH0 G N OW2 Z IY0 Z|D AY2 AH0 G N OW1 S IY0 Z|V
122
+ DICTATE|D IH0 K T EY1 T|D IH1 K T EY2 T|V
123
+ DICTATES|D IH0 K T EY1 T S|D IH1 K T EY2 T S|V
124
+ DIFFUSE|D IH0 F Y UW1 Z|D IH0 F Y UW1 S|V
125
+ DIGEST|D AY0 JH EH1 S T|D AY1 JH EH0 S T|V
126
+ DIGESTS|D AY2 JH EH1 S T S|D AY1 JH EH0 S T S|V
127
+ DISCARD|D IH0 S K AA1 R D|D IH1 S K AA0 R D|V
128
+ DISCARDS|D IH0 S K AA1 R D Z|D IH1 S K AA0 R D Z|V
129
+ DISCHARGE|D IH0 S CH AA1 R JH|D IH1 S CH AA2 R JH|V
130
+ DISCHARGES|D IH0 S CH AA1 R JH AH0 Z|D IH1 S CH AA2 R JH AH0 Z|V
131
+ DISCOUNT|D IH0 S K AW1 N T|D IH1 S K AW0 N T|V
132
+ DISCOUNTS|D IH0 S K AW1 N T S|D IH1 S K AW2 N T S|V
133
+ DISCOURSE|D IH0 S K AO1 R S|D IH1 S K AO0 R S|V
134
+ DISCOURSES|D IH0 S K AO1 R S IH0 Z|D IH1 S K AO0 R S IH0 Z|V
135
+ DOCUMENT|D AA1 K Y UW0 M EH0 N T|D AA1 K Y AH0 M AH0 N T|V
136
+ DOCUMENTS|D AA1 K Y UW0 M EH0 N T S|D AA1 K Y AH0 M AH0 N T S|V
137
+ DOGGED|D AO1 G IH0 D|D AO1 G D|V
138
+ DUPLICATE|D UW1 P L AH0 K EY2 T|D UW1 P L AH0 K AH0 T|V
139
+ DUPLICATES|D UW1 P L AH0 K EY2 T S|D UW1 P L AH0 K AH0 T S|V
140
+ EJACULATE|IH0 JH AE1 K Y UW0 L EY2 T|IH0 JH AE1 K Y UW0 L AH0 T|V
141
+ EJACULATES|IH0 JH AE1 K Y UW0 L EY2 T S|IH0 JH AE1 K Y UW0 L AH0 T S|V
142
+ ELABORATE|IH0 L AE1 B ER0 EY2 T|IH0 L AE1 B R AH0 T|V
143
+ ENTRANCE|IH0 N T R AH1 N S|EH1 N T R AH0 N S|V
144
+ ENTRANCES|IH0 N T R AH1 N S AH0 Z|EH1 N T R AH0 N S AH0 Z|V
145
+ ENVELOPE|IH0 N V EH1 L AH0 P|EH1 N V AH0 L OW2 P|V
146
+ ENVELOPES|IH0 N V EH1 L AH0 P S|EH1 N V AH0 L OW2 P S|V
147
+ ESCORT|EH0 S K AO1 R T|EH1 S K AO0 R T|V
148
+ ESCORTS|EH0 S K AO1 R T S|EH1 S K AO0 R T S|V
149
+ ESSAY|EH0 S EY1|EH1 S EY2|V
150
+ ESSAYS|EH0 S EY1 Z|EH1 S EY2 Z|V
151
+ ESTIMATE|EH1 S T AH0 M EY2 T|EH1 S T AH0 M AH0 T|V
152
+ ESTIMATES|EH1 S T AH0 M EY2 T S|EH1 S T AH0 M AH0 T S|V
153
+ EXCESS|IH0 K S EH1 S|EH1 K S EH2 S|V
154
+ EXCISE|EH0 K S AY1 S|EH1 K S AY0 Z|V
155
+ EXCUSE|IH0 K S K Y UW1 Z|IH0 K S K Y UW1 S|V
156
+ EXCUSES|IH0 K S K Y UW1 Z IH0 Z|IH0 K S K Y UW1 S IH0 Z|V
157
+ EXPATRIATE|EH0 K S P EY1 T R IY0 EY2 T|EH0 K S P EY1 T R IY0 AH0 T|V
158
+ EXPATRIATES|EH0 K S P EY1 T R IY0 EY2 T S|EH0 K S P EY1 T R IY0 AH0 T S|V
159
+ EXPLOIT|EH1 K S P L OY2 T|EH2 K S P L OY1 T|V
160
+ EXPLOITS|EH1 K S P L OY2 T S|EH2 K S P L OY1 T S|V
161
+ EXPORT|IH0 K S P AO1 R T|EH1 K S P AO0 R T|V
162
+ EXPORTS|IH0 K S P AO1 R T S|EH1 K S P AO0 R T S|V
163
+ EXTRACT|IH0 K S T R AE1 K T|EH1 K S T R AE2 K T|V
164
+ EXTRACTS|IH0 K S T R AE1 K T S|EH1 K S T R AE2 K T S|V
165
+ FERMENT|F ER0 M EH1 N T|F ER1 M EH0 N T|V
166
+ FERMENTS|F ER0 M EH1 N T S|F ER1 M EH0 N T S|V
167
+ FRAGMENT|F R AE1 G M AH0 N T|F R AE0 G M EH1 N T|V
168
+ FRAGMENTS|F R AE0 G M EH1 N T S|F R AE1 G M AH0 N T S|V
169
+ FREQUENT|F R IY1 K W EH2 N T|F R IY1 K W AH0 N T|V
170
+ GRADUATE|G R AE1 JH AH0 W EY2 T|G R AE1 JH AH0 W AH0 T|V
171
+ GRADUATES|G R AE1 JH AH0 W EY2 T S|G R AE1 JH AH0 W AH0 T S|V
172
+ HOUSE|HH AW1 Z|HH AW1 S|V
173
+ IMPACT|IH2 M P AE1 K T|IH1 M P AE0 K T|V
174
+ IMPACTS|IH2 M P AE1 K T S|IH1 M P AE0 K T S|V
175
+ IMPLANT|IH2 M P L AE1 N T|IH1 M P L AE2 N T|V
176
+ IMPLANTS|IH2 M P L AE1 N T S|IH1 M P L AE2 N T S|V
177
+ IMPLEMENT|IH1 M P L AH0 M EH0 N T|IH1 M P L AH0 M AH0 N T|V
178
+ IMPLEMENTS|IH1 M P L AH0 M EH0 N T S|IH1 M P L AH0 M AH0 N T S|V
179
+ IMPORT|IH2 M P AO1 R T|IH1 M P AO2 R T|V
180
+ IMPORTS|IH2 M P AO1 R T S|IH1 M P AO2 R T S|V
181
+ IMPRESS|IH0 M P R EH1 S|IH1 M P R EH0 S|V
182
+ IMPRINT|IH1 M P R IH0 N T|IH2 M P R IH1 N T|V
183
+ IMPRINTS|IH2 M P R IH1 N T S|IH1 M P R IH0 N T S|V
184
+ INCENSE|IH2 N S EH1 N S|IH1 N S EH2 N S|V
185
+ INCLINE|IH2 N K L AY1 N|IH1 N K L AY0 N|V
186
+ INCLINES|IH2 N K L AY1 N Z|IH1 N K L AY0 N Z|V
187
+ INCORPORATE|IH2 N K AO1 R P ER0 EY2 T|IH2 N K AO1 R P ER0 AH0 T|V
188
+ INCREASE|IH2 N K R IY1 S|IH1 N K R IY2 S|V
189
+ INCREASES|IH2 N K R IY1 S IH0 Z|IH1 N K R IY2 S IH0 Z|V
190
+ INDENT|IH2 N D EH1 N T|IH1 N D EH0 N T|V
191
+ INDENTS|IH2 N D EH1 N T S|IH1 N D EH0 N T S|V
192
+ INEBRIATE|IH2 N EH1 B R IY0 EY2 T|IH2 N EH1 B R IY0 AH0 T|V
193
+ INEBRIATES|IH2 N EH1 B R IY0 EY2 T S|IH2 N EH1 B R IY0 AH0 T S|V
194
+ INITIATE|IH2 N IH1 SH IY0 EY2 T|IH2 N IH1 SH IY0 AH0 T|V
195
+ INITIATES|IH2 N IH1 SH IY0 EY2 T S|IH2 N IH1 SH IY0 AH0 T S|V
196
+ INLAY|IH2 N L EY1|IH1 N L EY2|V
197
+ INLAYS|IH2 N L EY1 Z|IH1 N L EY2 Z|V
198
+ INSERT|IH2 N S ER1 T|IH1 N S ER2 T|V
199
+ INSERTS|IH2 N S ER1 T S|IH1 N S ER2 T S|V
200
+ INSET|IH2 N S EH1 T|IH1 N S EH2 T|V
201
+ INSETS|IH2 N S EH1 T S|IH1 N S EH2 T S|V
202
+ INSTINCT|IH2 N S T IH1 NG K T|IH1 N S T IH0 NG K T|V
203
+ INSULT|IH2 N S AH1 L T|IH1 N S AH2 L T|V
204
+ INSULTS|IH2 N S AH1 L T S|IH1 N S AH2 L T S|V
205
+ INTERCHANGE|IH2 T ER0 CH EY1 N JH|IH1 N T ER0 CH EY2 N JH|V
206
+ INTERCHANGES|IH2 T ER0 CH EY1 N JH IH0 Z|IH1 N T ER0 CH EY2 N JH IH0 Z|V
207
+ INTERDICT|IH2 N T ER0 D IH1 K T|IH1 N T ER0 D IH2 K T|V
208
+ INTERDICTS|IH2 N T ER0 D IH1 K T S|IH1 N T ER0 D IH2 K T S|V
209
+ INTERN|IH0 N T ER1 N|IH1 N T ER0 N|V
210
+ INTERNS|IH0 N T ER1 N Z|IH1 N T ER0 N Z|V
211
+ INTIMATE|IH1 N T IH0 M EY2 T|IH1 N T AH0 M AH0 T|V
212
+ INTIMATES|IH1 N T IH0 M EY2 T S|IH1 N T AH0 M AH0 T S|V
213
+ INTROVERT|IH2 N T R AO0 V ER1 T|IH1 N T R AO0 V ER2 T|V
214
+ INTROVERTS|IH2 N T R AO0 V ER1 T S|IH1 N T R AO0 V ER2 T S|V
215
+ INVERSE|IH1 N V ER0 S|IH2 N V ER1 S|V
216
+ INVITE|IH2 N V AY1 T|IH1 N V AY0 T|V
217
+ INVITES|IH2 N V AY1 T S|IH1 N V AY0 T S|V
218
+ JAGGED|JH AE1 G D|JH AE1 G IH0 D|V
219
+ LEARNED|L ER1 N IH0 D|L ER1 N D|V
220
+ LEGITIMATE|L AH0 JH IH1 T AH0 M EY2 T|L AH0 JH IH1 T AH0 M AH0 T|V
221
+ MANDATE|M AE1 N D EY2 T|M AE2 N D EY1 T|V
222
+ MISCONDUCT|M IH2 S K AA1 N D AH0 K T|M IH2 S K AA0 N D AH1 K T|V
223
+ MISPRINT|M IH2 S P R IH1 N T|M IH1 S P R IH0 N T|V
224
+ MISPRINTS|M IH2 S P R IH1 N T S|M IH1 S P R IH0 N T S|V
225
+ MISUSE|M IH0 S Y UW1 S|M IH0 S Y UW1 Z|V
226
+ MISUSES|M IH0 S Y UW1 Z IH0 Z|M IH0 S Y UW1 S IH0 Z|V
227
+ MODERATE|M AA1 D ER0 EY2 T|M AA1 D ER0 AH0 T|V
228
+ MODERATES|M AA1 D ER0 EY2 T S|M AA1 D ER0 AH0 T S|V
229
+ MOUTH|M AW1 TH|M AW1 DH|V
230
+ MOUTHS|M AW1 DH Z|M AW1 TH S|V
231
+ OBJECT|AA1 B JH EH0 K T|AH0 B JH EH1 K T|V
232
+ OBJECTS|AH0 B JH EH1 K T S|AA1 B JH EH0 K T S|V
233
+ ORNAMENT|AO1 R N AH0 M EH0 N T|AO1 R N AH0 M AH0 N T|V
234
+ ORNAMENTS|AO1 R N AH0 M EH0 N T S|AO1 R N AH0 M AH0 N T S|V
235
+ OVERCHARGE|OW2 V ER0 CH AA1 R JH|OW1 V ER0 CH AA2 R JH|V
236
+ OVERCHARGES|OW2 V ER0 CH AA1 R JH IH0 Z|OW1 V ER0 CH AA2 R JH IH0 Z|V
237
+ OVERFLOW|OW2 V ER0 F L OW1|OW1 V ER0 F L OW2|V
238
+ OVERFLOWS|OW2 V ER0 F L OW1 Z|OW1 V ER0 F L OW2 Z|V
239
+ OVERHANG|OW2 V ER0 HH AE1 NG|OW1 V ER0 HH AE2 NG|V
240
+ OVERHANGS|OW2 V ER0 HH AE1 NG Z|OW1 V ER0 HH AE2 NG Z|V
241
+ OVERHAUL|OW2 V ER0 HH AO1 L|OW1 V ER0 HH AO2 L|V
242
+ OVERHAULS|OW2 V ER0 HH AO1 L Z|OW1 V ER0 HH AO2 L Z|V
243
+ OVERLAP|OW2 V ER0 L AE1 P|OW1 V ER0 L AE2 P|V
244
+ OVERLAPS|OW2 V ER0 L AE1 P S|OW1 V ER0 L AE2 P S|V
245
+ OVERLAY|OW2 V ER0 L EY1|OW1 V ER0 L EY2|V
246
+ OVERLAYS|OW2 V ER0 L EY1 Z|OW1 V ER0 L EY2 Z|V
247
+ OVERWORK|OW2 V ER0 W ER1 K|OW1 V ER0 W ER2 K|V
248
+ PERFECT|P ER0 F EH1 K T|P ER1 F IH2 K T|V
249
+ PERFUME|P ER0 F Y UW1 M|P ER1 F Y UW0 M|V
250
+ PERFUMES|P ER0 F Y UW1 M Z|P ER1 F Y UW0 M Z|V
251
+ PERMIT|P ER0 M IH1 T|P ER1 M IH2 T|V
252
+ PERMITS|P ER0 M IH1 T S|P ER1 M IH2 T S|V
253
+ PERVERT|P ER0 V ER1 T|P ER1 V ER0 T|V
254
+ PERVERTS|P ER0 V ER1 T S|P ER1 V ER0 T S|V
255
+ PONTIFICATE|P AA0 N T IH1 F AH0 K AH0 T|P AA0 N T IH1 F AH0 K EY2 T|V
256
+ PONTIFICATES|P AA0 N T IH1 F AH0 K EY2 T S|P AA0 N T IH1 F AH0 K AH0 T S|V
257
+ PRECIPITATE|P R IH0 S IH1 P IH0 T AH0 T|P R IH0 S IH1 P IH0 T EY2 T|V
258
+ PREDICATE|P R EH1 D IH0 K AH0 T|P R EH1 D AH0 K EY2 T|V
259
+ PREDICATES|P R EH1 D AH0 K EY2 T S|P R EH1 D IH0 K AH0 T S|V
260
+ PREFIX|P R IY2 F IH1 K S|P R IY1 F IH0 K S|V
261
+ PREFIXES|P R IY2 F IH1 K S IH0 JH|P R IY1 F IH0 K S IH0 JH|V
262
+ PRESAGE|P R EH2 S IH1 JH|P R EH1 S IH0 JH|V
263
+ PRESAGES|P R EH2 S IH1 JH IH0 JH|P R EH1 S IH0 JH IH0 JH|V
264
+ PRESENT|P R IY0 Z EH1 N T|P R EH1 Z AH0 N T|V
265
+ PRESENTS|P R IY0 Z EH1 N T S|P R EH1 Z AH0 N T S|V
266
+ PROCEEDS|P R AH0 S IY1 D Z|P R OW1 S IY0 D Z|V
267
+ PROCESS|P R AO2 S EH1 S|P R AA1 S EH2 S|V
268
+ PROCESSES|P R AA1 S EH0 S AH0 Z|P R AO2 S EH1 S AH0 Z|V
269
+ PROCESSING|P R AA0 S EH1 S IH0 NG|P R AA1 S EH0 S IH0 NG|V
270
+ PRODUCE|P R AH0 D UW1 S|P R OW1 D UW0 S|V
271
+ PROGRESS|P R AH0 G R EH1 S|P R AA1 G R EH2 S|V
272
+ PROGRESSES|P R OW0 G R EH1 S AH0 Z|P R AA1 G R EH2 S AH0 Z|V
273
+ PROJECT|P R AA0 JH EH1 K T|P R AA1 JH EH0 K T|V
274
+ PROJECTS|P R AA0 JH EH1 K T S|P R AA1 JH EH0 K T S|V
275
+ PROSPECT|P R AH2 S P EH1 K T|P R AA1 S P EH0 K T|V
276
+ PROSPECTS|P R AH2 S P EH1 K T S|P R AA1 S P EH0 K T S|V
277
+ PROSTRATE|P R AA0 S T R EY1 T|P R AA1 S T R EY0 T|V
278
+ PROTEST|P R AH0 T EH1 S T|P R OW1 T EH2 S T|V
279
+ PROTESTS|P R AH0 T EH1 S T S|P R OW1 T EH2 S T S|V
280
+ PURPORT|P ER0 P AO1 R T|P ER1 P AO2 R T|V
281
+ QUADRUPLE|K W AA1 D R UW0 P AH0 L|K W AA0 D R UW1 P AH0 L|V
282
+ QUADRUPLES|K W AA0 D R UW1 P AH0 L Z|K W AA1 D R UW0 P AH0 L Z|V
283
+ RAGGED|R AE1 G D|R AE1 G AH0 D|V
284
+ RAMPAGE|R AE2 M P EY1 JH|R AE1 M P EY2 JH|V
285
+ RAMPAGES|R AE2 M P EY1 JH IH0 Z|R AE1 M P EY2 JH IH0 Z|V
286
+ READ|R IY1 D|R EH1 D|VBD
287
+ REBEL|R EH1 B AH0 L|R IH0 B EH1 L|V
288
+ REBELS|R IH0 B EH1 L Z|R EH1 B AH0 L Z|V
289
+ REBOUND|R IY0 B AW1 N D|R IY1 B AW0 N D|V
290
+ REBOUNDS|R IY0 B AW1 N D Z|R IY1 B AW0 N D Z|V
291
+ RECALL|R IH0 K AO1 L|R IY1 K AO2 L|V
292
+ RECALLS|R IH0 K AO1 L Z|R IY1 K AO2 L Z|V
293
+ RECAP|R IH0 K AE1 P|R IY1 K AE2 P|V
294
+ RECAPPED|R IH0 K AE1 P T|R IY1 K AE2 P T|V
295
+ RECAPPING|R IH0 K AE1 P IH0 NG|R IY1 K AE2 P IH0 NG|V
296
+ RECAPS|R IH0 K AE1 P S|R IY1 K AE2 P S|V
297
+ RECOUNT|R IY2 K AW1 N T| R IH1 K AW0 N T|V
298
+ RECOUNTS|R IY2 K AW1 N T S| R IH1 K AW0 N T S|V
299
+ RECORD|R IH0 K AO1 R D|R EH1 K ER0 D|V
300
+ RECORDS|R IH0 K AO1 R D Z|R EH1 K ER0 D Z|V
301
+ REFILL|R IY0 F IH1 L|R IY1 F IH0 L|V
302
+ REFILLS|R IY0 F IH1 L Z|R IY1 F IH0 L Z|V
303
+ REFIT|R IY0 F IH1 T|R IY1 F IH0 T|V
304
+ REFITS|R IY0 F IH1 T S|R IY1 F IH0 T S|V
305
+ REFRESH|R IH0 F R EH1 SH|R IH1 F R EH0 SH|V
306
+ REFUND|R IH0 F AH1 N D|R IY1 F AH2 N D|V
307
+ REFUNDS|R IH0 F AH1 N D Z|R IY1 F AH2 N D Z|V
308
+ REFUSE|R IH0 F Y UW1 Z|R EH1 F Y UW2 Z|V
309
+ REGENERATE|R IY0 JH EH1 N ER0 EY2 T|R IY0 JH EH1 N ER0 AH0 T|V
310
+ REHASH|R IY0 HH AE1 SH|R IY1 HH AE0 SH|V
311
+ REHASHES|R IY0 HH AE1 SH IH0 Z|R IY1 HH AE0 SH IH0 Z|V
312
+ REINCARNATE|R IY2 IH0 N K AA1 R N EY2 T|R IY2 IH0 N K AA1 R N AH0 T|V
313
+ REJECT|R IH0 JH EH1 K T|R IY1 JH EH0 K T|V
314
+ REJECTS|R IH0 JH EH1 K T S|R IY1 JH EH0 K T S|V
315
+ RELAY|R IY2 L EY1|R IY1 L EY2|V
316
+ RELAYING|R IY2 L EY1 IH0 NG|R IY1 L EY2 IH0 NG|V
317
+ RELAYS|R IY2 L EY1 Z|R IY1 L EY2 Z|V
318
+ REMAKE|R IY2 M EY1 K|R IY1 M EY0 K|V
319
+ REMAKES|R IY2 M EY1 K S|R IY1 M EY0 K S|V
320
+ REPLAY|R IY0 P L EY1|R IY1 P L EY0|V
321
+ REPLAYS|R IY0 P L EY1 Z|R IY1 P L EY0 Z|V
322
+ REPRINT|R IY0 P R IH1 N T|R IY1 P R IH0 N T|V
323
+ REPRINTS|R IY0 P R IH1 N T S|R IY1 P R IH0 N T S|V
324
+ RERUN|R IY2 R AH1 N|R IY1 R AH0 N|V
325
+ RERUNS|R IY2 R AH1 N Z|R IY1 R AH0 N Z|V
326
+ RESUME|R IY0 Z UW1 M|R EH1 Z AH0 M EY2|V
327
+ RETAKE|R IY0 T EY1 K|R IY1 T EY0 K|V
328
+ RETAKES|R IY0 T EY1 K S|R IY1 T EY0 K S|V
329
+ RETHINK|R IY2 TH IH1 NG K|R IY1 TH IH0 NG K|V
330
+ RETHINKS|R IY2 TH IH1 NG K S|R IY1 TH IH0 NG K S|V
331
+ RETREAD|R IY2 T R EH1 D|R IY1 T R EH0 D|V
332
+ RETREADS|R IY2 T R EH1 D Z|R IY1 T R EH0 D Z|V
333
+ REWRITE|R IY0 R AY1 T|R IY1 R AY2 T|V
334
+ REWRITES|R IY0 R AY1 T S|R IY1 R AY2 T S|V
335
+ SEGMENT|S EH1 G M AH0 N T|S EH2 G M EH1 N T|V
336
+ SEGMENTS|S EH2 G M EH1 N T S|S EH1 G M AH0 N T S|V
337
+ SEPARATE|S EH1 P ER0 EY2 T|S EH1 P ER0 IH0 T|V
338
+ SEPARATES|S EH1 P ER0 EY2 T S|S EH1 P ER0 IH0 T S|V
339
+ SUBCONTRACT|S AH0 B K AA1 N T R AE2 K T|S AH2 B K AA0 N T R AE1 K T|V
340
+ SUBCONTRACTS|S AH2 B K AA0 N T R AE1 K T S|S AH0 B K AA1 N T R AE2 K T S|V
341
+ SUBJECT|S AH0 B JH EH1 K T|S AH1 B JH IH0 K T|V
342
+ SUBJECTS|S AH0 B JH EH1 K T S|S AH1 B JH IH0 K T S|V
343
+ SUBORDINATE|S AH0 B AO1 R D AH0 N EY2 T|S AH0 B AO1 R D AH0 N AH0 T|V
344
+ SUBORDINATES|S AH0 B AO1 R D AH0 N EY2 T S|S AH0 B AO1 R D AH0 N AH0 T S|V
345
+ SUPPLEMENT|S AH1 P L AH0 M EH0 N T|S AH1 P L AH0 M AH0 N T|V
346
+ SUPPLEMENTS|S AH1 P L AH0 M EH0 N T S|S AH1 P L AH0 M AH0 N T S|V
347
+ SURMISE|S ER0 M AY1 Z|S ER1 M AY0 Z|V
348
+ SURMISES|S ER0 M AY1 Z IH0 Z|S ER1 M AY0 Z IH0 Z|V
349
+ SURVEY|S ER0 V EY1|S ER1 V EY2|V
350
+ SURVEYS|S ER0 V EY1 Z|S ER1 V EY2 Z|V
351
+ SUSPECT|S AH0 S P EH1 K T|S AH1 S P EH2 K T|V
352
+ SUSPECTS|S AH0 S P EH1 K T S|S AH1 S P EH2 K T S|V
353
+ SYNDICATE|S IH1 N D AH0 K EY2 T|S IH1 N D IH0 K AH0 T|V
354
+ SYNDICATES|S IH1 N D IH0 K EY2 T S|S IH1 N D IH0 K AH0 T S|V
355
+ TORMENT|T AO1 R M EH2 N T|T AO0 R M EH1 N T|V
356
+ TRANSFER|T R AE0 N S F ER1|T R AE1 N S F ER0|V
357
+ TRANSFERS|T R AE0 N S F ER1 Z|T R AE1 N S F ER0 Z|V
358
+ TRANSPLANT|T R AE0 N S P L AE1 N T|T R AE1 N S P L AE0 N T|V
359
+ TRANSPLANTS|T R AE0 N S P L AE1 N T S|T R AE1 N S P L AE0 N T S|V
360
+ TRANSPORT|T R AE0 N S P AO1 R T|T R AE1 N S P AO0 R T|V
361
+ TRANSPORTS|T R AE0 N S P AO1 R T S|T R AE1 N S P AO0 R T S|V
362
+ TRIPLICATE|T R IH1 P L IH0 K EY2 T|T R IH1 P L IH0 K AH0 T|V
363
+ TRIPLICATES|T R IH1 P L IH0 K EY2 T S|T R IH1 P L IH0 K AH0 T S|V
364
+ UNDERCUT|AH2 N D ER0 K AH1 T|AH1 N D ER0 K AH2 T|V
365
+ UNDERESTIMATE|AH1 N D ER0 EH1 S T AH0 M EY2 T|AH1 N D ER0 EH1 S T AH0 M AH0 T|V
366
+ UNDERESTIMATES|AH1 N D ER0 EH1 S T AH0 M EY2 T S|AH1 N D ER0 EH1 S T AH0 M AH0 T S|V
367
+ UNDERLINE|AH2 N D ER0 L AY1 N|AH1 N D ER0 L AY2 N|V
368
+ UNDERLINES|AH2 N D ER0 L AY1 N Z|AH1 N D ER0 L AY2 N Z|V
369
+ UNDERTAKING|AH2 N D ER0 T EY1 K IH0 NG|AH1 N D ER0 T EY2 K IH0 NG|V
370
+ UNDERTAKINGS|AH2 N D ER0 T EY1 K IH0 NG Z|AH1 N D ER0 T EY2 K IH0 NG Z|V
371
+ UNUSED|AH0 N Y UW1 Z D|AH0 N Y UW1 S T|V
372
+ UPGRADE|AH0 P G R EY1 D|AH1 P G R EY0 D|V
373
+ UPGRADES|AH0 P G R EY1 D Z|AH1 P G R EY0 D Z|V
374
+ UPLIFT|AH2 P L IH1 F T|AH1 P L IH0 F T|V
375
+ UPSET|AH0 P S EH1 T|AH1 P S EH2 T|V
376
+ UPSETS|AH0 P S EH1 T S|AH1 P S EH2 T S|V
377
+ USE|Y UW1 Z|Y UW1 S|V
378
+ USED|Y UW1 Z D|Y UW1 S T|VBN
379
+ USES|Y UW1 Z IH0 Z|Y UW1 S IH0 Z|V
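Each non-comment line above is pipe-delimited as HEADWORD|PRON1|PRON2|POS: PRON1 applies when the word's POS tag starts with POS, otherwise PRON2. This is the convention parsed by construct_homograph_dictionary in g2p.py; a standalone sketch of the same parsing:

    # Minimal sketch mirroring construct_homograph_dictionary -- not part of the committed file.
    import codecs

    homograph2features = {}
    for line in codecs.open('homographs.en', 'r', 'utf8').read().splitlines():
        if line.startswith('#'):
            continue  # skip header comments
        headword, pron1, pron2, pos1 = line.strip().split('|')
        homograph2features[headword.lower()] = (pron1.split(), pron2.split(), pos1)

    print(homograph2features['use'])  # (['Y', 'UW1', 'Z'], ['Y', 'UW1', 'S'], 'V')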
dataset/google.py ADDED
@@ -0,0 +1,188 @@
1
+ import math, os, re, sys
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import pandas as pd
5
+ from multiprocessing import Pool
6
+ from scipy.io import wavfile
7
+ import tensorflow as tf
8
+
9
+ from tensorflow.keras.utils import Sequence, OrderedEnqueuer
10
+ from tensorflow.keras import layers
11
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
12
+
13
+ sys.path.append(os.path.dirname(__file__))
14
+ from g2p.g2p_en.g2p import G2p
15
+
16
+ import warnings
17
+ warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
18
+ np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
19
+
20
+ class GoogleCommandsDataloader(Sequence):
21
+ def __init__(self,
22
+ batch_size,
23
+ fs = 16000,
24
+ wav_dir='/home/DB/google_speech_commands',
25
+ target_list=['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go'],
26
+ features='g2p_embed', # phoneme, g2p_embed, both ...
27
+ shuffle=True,
28
+ testset_only=False,
29
+ pkl=None,
30
+ ):
31
+
32
+ phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
33
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
34
+ 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
35
+ 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
36
+ 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
37
+ 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
38
+ 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
39
+ ' ']
40
+
41
+ self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
42
+ self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
43
+
44
+ self.batch_size = batch_size
45
+ self.fs = fs
46
+ self.wav_dir = wav_dir
47
+ self.target_list = [x.lower() for x in target_list]
48
+ self.testset_only = testset_only
49
+ self.features = features
50
+ self.shuffle = shuffle
51
+ self.pkl = pkl
52
+ self.nPhoneme = len(phonemes)
53
+ self.g2p = G2p()
54
+
55
+ self.__prep__()
56
+ self.on_epoch_end()
57
+
58
+ def __prep__(self):
59
+ self.data = pd.DataFrame(columns=['wav', 'text', 'duration', 'label'])
60
+
61
+ if (self.pkl is not None) and (os.path.isfile(self.pkl)):
62
+ print(">> Load dataset from {}".format(self.pkl))
63
+ self.data = pd.read_pickle(self.pkl)
64
+ else:
65
+ print(">> Make dataset from {}".format(self.wav_dir))
66
+ target_dict = {}
67
+ idx = 0
68
+ for target in self.target_list:
69
+ print(">> Extract from {}".format(target))
70
+ if self.testset_only:
71
+ test_list = os.path.join(self.wav_dir, 'testing_list.txt')
72
+ with open(test_list, "r") as f:
73
+ wav_list = f.readlines()
74
+ wav_list = [os.path.join(self.wav_dir, x.strip()) for x in wav_list]
75
+ wav_list = [x for x in wav_list if target == x.split('/')[-2]]
76
+ else:
77
+ wav_list = [str(x) for x in Path(os.path.join(self.wav_dir, target)).rglob('*.wav')]
78
+ for wav in wav_list:
79
+ anchor_text = wav.split('/')[-2].lower()
80
+ duration = float(wavfile.read(wav)[1].shape[-1])/self.fs
81
+ for comparison_text in self.target_list:
82
+ label = 1 if anchor_text == comparison_text else 0
83
+ target_dict[idx] = {
84
+ 'wav': wav,
85
+ 'text': comparison_text,
86
+ 'duration': duration,
87
+ 'label': label
88
+ }
89
+ idx += 1
90
+ self.data = self.data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
91
+
92
+ # g2p & p2idx by g2p_en package
93
+ print(">> Convert word to phoneme")
94
+ self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
95
+ print(">> Convert phoneme to index")
96
+ self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
97
+ print(">> Compute phoneme embedding")
98
+ self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
99
+
100
+ if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
101
+ self.data.to_pickle(self.pkl)
102
+
103
+ # Get longest data
104
+ self.data = self.data.sort_values(by='duration').reset_index(drop=True)
105
+ self.wav_list = self.data['wav'].values
106
+ self.idx_list = self.data['pIndex'].values
107
+ self.emb_list = self.data['g2p_embed'].values
108
+ self.lab_list = self.data['label'].values
109
+
110
+ # Set dataloader params.
111
+ self.len = len(self.data)
112
+ self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
113
+ self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1 ) * self.fs / 2)
114
+
115
+ def __len__(self):
116
+ # return total batch-wise length
117
+ return math.ceil(self.len / self.batch_size)
118
+
119
+ def _load_wav(self, wav):
120
+ return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
121
+
122
+ def __getitem__(self, idx):
123
+ # chunking
124
+ indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
125
+
126
+ # load inputs
127
+ batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
128
+ if self.features == 'both':
129
+ batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
130
+ batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
131
+ else:
132
+ if self.features == 'phoneme':
133
+ batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
134
+ elif self.features == 'g2p_embed':
135
+ batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
136
+ # load outputs
137
+ batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
138
+
139
+ # padding and masking
140
+ pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
141
+ if self.features == 'both':
142
+ pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
143
+ pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
144
+ else:
145
+ pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
146
+ pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
147
+
148
+ if self.features == 'both':
149
+ return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
150
+ else:
151
+ return pad_batch_x, pad_batch_y, pad_batch_z
152
+
153
+ def on_epoch_end(self):
154
+ self.indices = np.arange(self.len)
155
+ if self.shuffle == True:
156
+ np.random.shuffle(self.indices)
157
+
158
+ def convert_sequence_to_dataset(dataloader):
159
+ def data_generator():
160
+ for i in range(dataloader.__len__()):
161
+ if dataloader.features == 'both':
162
+ pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
163
+ yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
164
+ else:
165
+ pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
166
+ yield pad_batch_x, pad_batch_y, pad_batch_z
167
+
168
+ if dataloader.features == 'both':
169
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
170
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
171
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
172
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
173
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
174
+ )
175
+ else:
176
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
177
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
178
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
179
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
180
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
181
+ )
182
+ # data_dataset = data_dataset.cache()
183
+ data_dataset = data_dataset.prefetch(1)
184
+
185
+ return data_dataset
186
+
187
+ if __name__ == '__main__':
188
+ dataloader = GoogleCommandsDataloader(2048, testset_only=True, pkl='/home/DB/google_speech_commands/google_testset.pkl', features='g2p_embed')
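A sketch of driving this loader through convert_sequence_to_dataset (batch size and paths are illustrative; the yielded shapes follow the TensorSpecs above):

    # Hypothetical usage sketch -- not part of the committed file.
    from dataset.google import GoogleCommandsDataloader, convert_sequence_to_dataset

    loader = GoogleCommandsDataloader(batch_size=512, testset_only=True,
                                      wav_dir='/home/DB/google_speech_commands',
                                      features='g2p_embed')
    dataset = convert_sequence_to_dataset(loader)
    for wavs, texts, labels in dataset:
        # wavs: (B, maxlen_a) float32, texts: (B, maxlen_t, 256) float32, labels: (B, 1) float32
        break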
dataset/google_infe202405.py ADDED
@@ -0,0 +1,192 @@
1
+ import math, os, re, sys
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import pandas as pd
5
+ from multiprocessing import Pool
6
+ from scipy.io import wavfile
7
+ import tensorflow as tf
8
+
9
+ from tensorflow.keras.utils import Sequence, OrderedEnqueuer
10
+ from tensorflow.keras import layers
11
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
12
+
13
+ sys.path.append(os.path.dirname(__file__))
14
+ from g2p.g2p_en.g2p import G2p
15
+
16
+ import warnings
17
+ warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
18
+ np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
19
+
20
+ class GoogleCommandsDataloader(Sequence):
21
+ def __init__(self,
22
+ batch_size,
23
+ fs = 16000,
24
+ wav_dir='/home/DB/kws_google/data2',
25
+ target_list=['bed','three','bird','cat','dog','eight','five','four','happy','house','marvin','nine',
26
+ 'one','seven','sheila','six','tree','two','wow','zero'],
27
+ features='g2p_embed', # phoneme, g2p_embed, both ...
28
+ shuffle=True,
29
+ testset_only=False,
30
+ pkl=None,
31
+ ):
32
+
33
+ phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
34
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
35
+ 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
36
+ 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
37
+ 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
38
+ 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
39
+ 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
40
+ ' ']
41
+
42
+ self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
43
+ self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
44
+
45
+ self.batch_size = batch_size
46
+ self.fs = fs
47
+ self.wav_dir = wav_dir
48
+ self.target_list = [x.lower() for x in target_list]
49
+ self.testset_only = testset_only
50
+ self.features = features
51
+ self.shuffle = shuffle
52
+ self.pkl = pkl
53
+ self.nPhoneme = len(phonemes)
54
+ self.g2p = G2p()
55
+
56
+ self.__prep__()
57
+ self.on_epoch_end()
58
+
59
+ def __prep__(self):
60
+ self.data = pd.DataFrame(columns=['wav', 'text', 'duration', 'label'])
61
+
62
+ if (self.pkl is not None) and (os.path.isfile(self.pkl)):
63
+ print(">> Load dataset from {}".format(self.pkl))
64
+ self.data = pd.read_pickle(self.pkl)
65
+ else:
66
+ print(">> Make dataset from {}".format(self.wav_dir))
67
+ target_dict = {}
68
+ idx = 0
69
+ for target in self.target_list:
70
+ print(">> Extract from {}".format(target))
71
+ if self.testset_only:
72
+ test_list = os.path.join(self.wav_dir, 'testing_list.txt')
73
+ with open(test_list, "r") as f:
74
+ wav_list = f.readlines()
75
+ wav_list = [os.path.join(self.wav_dir, x.strip()) for x in wav_list]
76
+ wav_list = [x for x in wav_list if target == x.split('/')[-2]]
77
+ else:
78
+ wav_list = [str(x) for x in Path(os.path.join(self.wav_dir, target)).rglob('*.wav')]
79
+
80
+ for wav in wav_list:
81
+ anchor_text = wav.split('/')[-2].lower()
82
+ duration = float(wavfile.read(wav)[1].shape[-1])/self.fs
83
+ for comparison_text in self.target_list:
84
+ label = 1 if anchor_text == comparison_text else 0
85
+ target_dict[idx] = {
86
+ 'wav': wav,
87
+ 'text': comparison_text,
88
+ 'duration': duration,
89
+ 'label': label
90
+ }
91
+ idx += 1
92
+ self.data = self.data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
93
+
94
+ # g2p & p2idx by g2p_en package
95
+ print(">> Convert word to phoneme")
96
+ self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
97
+ print(">> Convert phoneme to index")
98
+ self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
99
+ print(">> Compute phoneme embedding")
100
+ self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
101
+
102
+ if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
103
+ self.data.to_pickle(self.pkl)
104
+
105
+
106
+ # Extract lists in the original row order (kept for inference); the sort below only sizes the padding
107
+ self.wav_list = self.data['wav'].values
108
+ self.idx_list = self.data['pIndex'].values
109
+ self.emb_list = self.data['g2p_embed'].values
110
+ self.lab_list = self.data['label'].values
111
+ self.data = self.data.sort_values(by='duration').reset_index(drop=True)
112
+
113
+ # Set dataloader params.
114
+ self.len = len(self.data)
115
+ self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
116
+ self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1 ) * self.fs / 2)
117
+
118
+ def __len__(self):
119
+ # return total batch-wise length
120
+ return math.ceil(self.len / self.batch_size)
121
+
122
+ def _load_wav(self, wav):
123
+ return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
124
+
125
+ def __getitem__(self, idx):
126
+ # chunking
127
+ indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
128
+
129
+ # load inputs
130
+ batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
131
+ if self.features == 'both':
132
+ batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
133
+ batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
134
+ else:
135
+ if self.features == 'phoneme':
136
+ batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
137
+ elif self.features == 'g2p_embed':
138
+ batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
139
+ # load outputs
140
+ batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
141
+
142
+ # padding and masking
143
+ pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
144
+ if self.features == 'both':
145
+ pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
146
+ pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
147
+ else:
148
+ pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
149
+ pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
150
+
151
+ if self.features == 'both':
152
+ return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
153
+ else:
154
+ return pad_batch_x, pad_batch_y, pad_batch_z
155
+
156
+ def on_epoch_end(self):
157
+ self.indices = np.arange(self.len)
158
+ # if self.shuffle == True:
159
+ # np.random.shuffle(self.indices)
160
+
161
+ def convert_sequence_to_dataset(dataloader):
162
+ def data_generator():
163
+ for i in range(dataloader.__len__()):
164
+ if dataloader.features == 'both':
165
+ pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
166
+ yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
167
+ else:
168
+ pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
169
+ yield pad_batch_x, pad_batch_y, pad_batch_z
170
+
171
+ if dataloader.features == 'both':
172
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
173
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
174
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
175
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
176
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
177
+ )
178
+ else:
179
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
180
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
181
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
182
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
183
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
184
+ )
185
+ # data_dataset = data_dataset.cache()
186
+ # data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=output_signature)
187
+ data_dataset = data_dataset.prefetch(1)
188
+
189
+ return data_dataset
190
+
191
+ if __name__ == '__main__':
192
+ dataloader = GoogleCommandsDataloader(2048, testset_only=True, pkl='/home/DB/google_speech_commands/google_testset.pkl', features='g2p_embed')
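For reference, maxlen_a above rounds the longest clip up to the next half second of audio; a worked example with illustrative values:

    # Worked example of the maxlen_a arithmetic -- values are illustrative, not from the repo.
    fs = 16000
    d_max = 1.0                                      # duration (s) of the longest clip
    maxlen_a = int((int(d_max / 0.5) + 1) * fs / 2)  # (2 + 1) * 8000
    assert maxlen_a == 24000                         # i.e. padded to 1.5 s at 16 kHz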
dataset/libriphrase.py ADDED
@@ -0,0 +1,331 @@
1
+ import math, os, re, sys
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import pandas as pd
5
+ import Levenshtein
6
+ from multiprocessing import Pool
7
+ from scipy.io import wavfile
8
+ import tensorflow as tf
9
+
10
+ from tensorflow.keras.utils import Sequence, OrderedEnqueuer
11
+ from tensorflow.keras import layers
12
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
13
+
14
+ sys.path.append(os.path.dirname(__file__))
15
+ from g2p.g2p_en.g2p import G2p
16
+
17
+ import warnings
18
+ warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
19
+ np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
20
+
21
+ class LibriPhraseDataloader(Sequence):
22
+ def __init__(self,
23
+ batch_size,
24
+ fs = 16000,
25
+ wav_dir='/home/DB/LibriPhrase/wav_dir',
26
+ noise_dir='/home/DB/noise',
27
+ csv_dir='/home/DB/LibriPhrase/data',
28
+ train_csv = ['train_100h', 'train_360h'],
29
+ test_csv = ['train_500h',],
30
+ types='both', # easy, hard
31
+ features='g2p_embed', # phoneme, g2p_embed, both ...
32
+ train=True,
33
+ shuffle=True,
34
+ pkl=None,
35
+ edit_dist=False,
36
+ ):
37
+
38
+ phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
39
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
40
+ 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
41
+ 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
42
+ 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
43
+ 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
44
+ 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
45
+ ' ']
46
+
47
+ self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
48
+ self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
49
+
50
+ self.batch_size = batch_size
51
+ self.fs = fs
52
+ self.wav_dir = wav_dir
53
+ self.csv_dir = csv_dir
54
+ self.noise_dir = noise_dir
55
+ self.train_csv = train_csv
56
+ self.test_csv = test_csv
57
+ self.types = types
58
+ self.features = features
59
+ self.train = train
60
+ self.shuffle = shuffle
61
+ self.pkl = pkl
62
+ self.edit_dist = edit_dist
63
+ self.nPhoneme = len(phonemes)
64
+ self.g2p = G2p()
65
+
66
+ self.__prep__()
67
+ self.on_epoch_end()
68
+
69
+ def __prep__(self):
70
+ if self.train:
71
+ print(">> Preparing noise DB")
72
+ noise_list = [str(x) for x in Path(self.noise_dir).rglob('*.wav')]
73
+ self.noise = np.array([])
74
+ for noise in noise_list:
75
+ fs, data = wavfile.read(noise)
76
+ assert fs == self.fs, ">> Error : Un-match sampling freq.\n{} -> {}".format(noise, fs)
77
+ data = data.astype(np.float32) / 32768.0
78
+ data = (data / np.max(data)) * 0.5
79
+ self.noise = np.append(self.noise, data)
80
+
81
+ self.data = pd.DataFrame(columns=['wav_label', 'wav', 'text', 'duration', 'label', 'type'])
82
+
83
+ if (self.pkl is not None) and (os.path.isfile(self.pkl)):
84
+ print(">> Load dataset from {}".format(self.pkl))
85
+ self.data = pd.read_pickle(self.pkl)
86
+ else:
87
+ for db in self.train_csv if self.train else self.test_csv:
88
+ csv_list = [str(x) for x in Path(self.csv_dir).rglob('*' + db + '*word*')]
89
+ for n_word in csv_list:
90
+ print(">> processing : {} ".format(n_word))
91
+ df = pd.read_csv(n_word)
92
+ # Split train dataset to match & unmatch case
93
+ anc_pos = df[['anchor_text', 'anchor', 'anchor_text', 'anchor_dur']]
94
+ anc_neg = df[['anchor_text', 'anchor', 'comparison_text', 'anchor_dur', 'target', 'type']]
95
+ com_pos = df[['comparison_text', 'comparison', 'comparison_text', 'comparison_dur']]
96
+ com_neg = df[['comparison_text', 'comparison', 'anchor_text', 'comparison_dur', 'target', 'type']]
97
+ anc_pos.columns = ['wav_label', 'anchor', 'anchor_text', 'anchor_dur']
98
+ com_pos.columns = ['wav_label', 'comparison', 'comparison_text', 'comparison_dur']
99
+ anc_pos['label'] = 1
100
+ anc_pos['type'] = df['type']
101
+ com_pos['label'] = 1
102
+ com_pos['type'] = df['type']
103
+ # Concat
104
+ self.data = self.data.append(anc_pos.rename(columns={y: x for x, y in zip(self.data.columns, anc_pos.columns)}), ignore_index=True)
105
+ self.data = self.data.append(anc_neg.rename(columns={y: x for x, y in zip(self.data.columns, anc_neg.columns)}), ignore_index=True)
106
+ self.data = self.data.append(com_pos.rename(columns={y: x for x, y in zip(self.data.columns, com_pos.columns)}), ignore_index=True)
107
+ self.data = self.data.append(com_neg.rename(columns={y: x for x, y in zip(self.data.columns, com_neg.columns)}), ignore_index=True)
108
+
109
+ # Append wav directory path
110
+ self.data['wav'] = self.data['wav'].apply(lambda x: os.path.join(self.wav_dir, x))
111
+ # g2p & p2idx by g2p_en package
112
+ print(">> Convert word to phoneme")
113
+ self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
114
+ print(">> Convert speech word to phoneme")
115
+ self.data['wav_phoneme'] = self.data['wav_label'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
116
+ print(">> Convert phoneme to index")
117
+ self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
118
+ print(">> Convert speech phoneme to index")
119
+ self.data['wav_pIndex'] = self.data['wav_phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
120
+ print(">> Compute phoneme embedding")
121
+ self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
122
+ print(">> Calculate edit distance ratio")
123
+ self.data['dist'] = self.data.apply(lambda x: Levenshtein.ratio(re.sub(r"[^a-zA-Z0-9]+", ' ', x['wav_label']), re.sub(r"[^a-zA-Z0-9]+", ' ', x['text'])), axis=1)
124
+
125
+ if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
126
+ self.data.to_pickle(self.pkl)
127
+
128
+ # Masking dataset type
129
+ if self.types == 'both':
130
+ pass
131
+ elif self.types == 'easy':
132
+ self.data = self.data.loc[self.data['type'] == 'diffspk_easyneg']
133
+ elif self.types == 'hard':
134
+ self.data = self.data.loc[self.data['type'] == 'diffspk_hardneg']
135
+
136
+ # Get longest data
137
+ self.data = self.data.sort_values(by='duration').reset_index(drop=True)
138
+ self.wav_list = self.data['wav'].values
139
+ self.idx_list = self.data['pIndex'].values
140
+ self.sIdx_list = self.data['wav_pIndex'].values
141
+ self.emb_list = self.data['g2p_embed'].values
142
+ self.lab_list = self.data['label'].values
143
+ if self.edit_dist:
144
+ self.dist_list = self.data['dist'].values
145
+
146
+ # Set dataloader params.
147
+ self.len = len(self.data)
148
+ self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
149
+ self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1 ) * self.fs / 2)
150
+ self.maxlen_l = int((int(self.data['wav_label'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
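+ # Padding lengths: text/label lengths are rounded up to the next multiple of 10 characters,
+ # and the longest duration up to the next 0.5 s (then converted to samples at self.fs).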
151
+
152
+ def __len__(self):
153
+ # return total batch-wise length
154
+ return math.ceil(self.len / self.batch_size)
155
+
156
+ def _load_wav(self, wav):
157
+ return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
158
+
159
+ def _mixing_snr(self, clean, snr=[5, 15]):
160
+ def _cal_adjusted_rms(clean_rms, snr):
161
+ a = float(snr) / 20
162
+ noise_rms = clean_rms / (10**a)
163
+ return noise_rms
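+ # Since SNR_dB = 20 * log10(clean_rms / noise_rms) for amplitudes, noise_rms = clean_rms / 10**(SNR_dB / 20).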
164
+
165
+ def _cal_rms(amp):
166
+ return np.sqrt(np.mean(np.square(amp), axis=-1))
167
+
168
+ start = np.random.randint(0, len(self.noise)-len(clean))
169
+ divided_noise = self.noise[start: start + len(clean)]
170
+ clean_rms = _cal_rms(clean)
171
+ noise_rms = _cal_rms(divided_noise)
172
+ adj_noise_rms = _cal_adjusted_rms(clean_rms, np.random.randint(snr[0], snr[1]))
173
+
174
+ adj_noise_amp = divided_noise * (adj_noise_rms / (noise_rms + 1e-7))
175
+ noisy = clean + adj_noise_amp
176
+
177
+ if np.max(noisy) > 1:
178
+ noisy = noisy / np.max(noisy)
179
+
180
+ return noisy
181
+
182
+ def __getitem__(self, idx):
183
+ # chunking
184
+ indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
185
+
186
+ # load inputs
187
+ batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
188
+ if self.features == 'both':
189
+ batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
190
+ batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
191
+ else:
192
+ if self.features == 'phoneme':
193
+ batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
194
+ elif self.features == 'g2p_embed':
195
+ batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
196
+ # load outputs
197
+ batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
198
+ batch_l = [np.array(self.sIdx_list[i]).astype(np.int32) for i in indices]
199
+ batch_t = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
200
+ if self.edit_dist:
201
+ batch_d = [np.array([self.dist_list[i]]).astype(np.float32) for i in indices]
202
+
203
+ # padding and masking
204
+ pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
205
+ if self.features == 'both':
206
+ pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
207
+ pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
208
+ else:
209
+ pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
210
+ pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
211
+ pad_batch_l = pad_sequences(np.array(batch_l), maxlen=self.maxlen_l, value=0.0, padding='post', dtype=batch_l[0].dtype)
212
+ pad_batch_t = pad_sequences(np.array(batch_t), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_t[0].dtype)
213
+ if self.edit_dist:
214
+ pad_batch_d = pad_sequences(np.array(batch_d), value=0.0, padding='post', dtype=batch_d[0].dtype)
215
+
216
+ # Noisy option
217
+ if self.train:
218
+ batch_x_noisy = [self._mixing_snr(x) for x in batch_x]
219
+ pad_batch_x_noisy = pad_sequences(np.array(batch_x_noisy), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x_noisy[0].dtype)
220
+
221
+ if self.train:
222
+ if self.features == 'both':
223
+ return pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t
224
+ else:
225
+ return pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t
226
+ else:
227
+ if self.features == 'both':
228
+ if self.edit_dist:
229
+ return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d
230
+ else:
231
+ return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
232
+ else:
233
+ if self.edit_dist:
234
+ return pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d
235
+ else:
236
+ return pad_batch_x, pad_batch_y, pad_batch_z
237
+
238
+ def on_epoch_end(self):
239
+ self.indices = np.arange(self.len)
240
+ if self.shuffle == True:
241
+ np.random.shuffle(self.indices)
242
+
243
+ def convert_sequence_to_dataset(dataloader):
244
+ def data_generator():
245
+ for i in range(dataloader.__len__()):
246
+ if dataloader.train:
247
+ if dataloader.features == 'both':
248
+ pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t = dataloader[i]
249
+ yield pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t
250
+ else:
251
+ pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t = dataloader[i]
252
+ yield pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t
253
+ else:
254
+ if dataloader.features == 'both':
255
+ if dataloader.edit_dist:
256
+ pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d = dataloader[i]
257
+ yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d
258
+ else:
259
+ pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
260
+ yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
261
+ else:
262
+ if dataloader.edit_dist:
263
+ pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d = dataloader[i]
264
+ yield pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d
265
+ else:
266
+ pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
267
+ yield pad_batch_x, pad_batch_y, pad_batch_z
268
+
269
+ if dataloader.train:
270
+ if dataloader.features == 'both':
271
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
272
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
273
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
274
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
275
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
276
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
277
+ tf.TensorSpec(shape=(None, dataloader.maxlen_l), dtype=tf.int32),
278
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),)
279
+ )
280
+ else:
281
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
282
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
283
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
284
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
285
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
286
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
287
+ tf.TensorSpec(shape=(None, dataloader.maxlen_l), dtype=tf.int32),
288
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),)
289
+ )
290
+ else:
291
+ if dataloader.features == 'both':
292
+ if dataloader.edit_dist:
293
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
294
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
295
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
296
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
297
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
298
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
299
+ )
300
+ else:
301
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
302
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
303
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
304
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
305
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
306
+ )
307
+ else:
308
+ if dataloader.edit_dist:
309
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
310
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
311
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
312
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
313
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
314
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
315
+ )
316
+ else:
317
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
318
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
319
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
320
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
321
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
322
+ )
323
+ # data_dataset = data_dataset.cache()
324
+ data_dataset = data_dataset.prefetch(1)
325
+
326
+ return data_dataset
327
+
328
+ if __name__ == '__main__':
329
+ GLOBAL_BATCH_SIZE = 2048
330
+ train_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=True, types='both', shuffle=True, pkl='/home/DB/LibriPhrase/data/train_both.pkl', features='g2p_embed')
331
+ test_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=False, edit_dist=True, types='both', shuffle=False, pkl='/home/DB/LibriPhrase/data/test_both.pkl', features='g2p_embed')
dataset/libriphrase_ctc1.py ADDED
@@ -0,0 +1,346 @@
1
+ import math, os, re, sys
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import pandas as pd
5
+ import Levenshtein
6
+ from multiprocessing import Pool
7
+ from scipy.io import wavfile
8
+ import tensorflow as tf
9
+
10
+ from tensorflow.keras.utils import Sequence, OrderedEnqueuer
11
+ from tensorflow.keras import layers
12
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
13
+
14
+ sys.path.append(os.path.dirname(__file__))
15
+ from g2p.g2p_en.g2p import G2p
16
+
17
+ import warnings
18
+ warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
19
+ np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
20
+
21
+ class LibriPhraseDataloader(Sequence):
22
+ def __init__(self,
23
+ batch_size,
24
+ fs = 16000,
25
+ wav_dir='/share/nas165/yiting/LibriPhrase/LibriPhrase_data',
26
+ noise_dir='/share/nas165/yiting/EEND/corpora/JHU/musan/musan/noise/sound-bible',
27
+ csv_dir='/share/nas165/yiting/LibriPhrase/data',
28
+ train_csv = ['train100h','train_360h'],
29
+ test_csv = ['train_500h',],
30
+ types='both', # easy, hard
31
+ features='g2p_embed', # phoneme, g2p_embed, both ...
32
+ train=True,
33
+ shuffle=True,
34
+ pkl=None,
35
+ edit_dist=False,
36
+ ):
37
+
38
+ phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
39
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
40
+ 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
41
+ 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
42
+ 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
43
+ 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
44
+ 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
45
+ ' ']
46
+
47
+ self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
48
+ self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
49
+
50
+ self.batch_size = batch_size
51
+ self.fs = fs
52
+ self.wav_dir = wav_dir
53
+ self.csv_dir = csv_dir
54
+ self.noise_dir = noise_dir
55
+ self.train_csv = train_csv
56
+ self.test_csv = test_csv
57
+ self.types = types
58
+ self.features = features
59
+ self.train = train
60
+ self.shuffle = shuffle
61
+ self.pkl = pkl
62
+ self.edit_dist = edit_dist
63
+ self.nPhoneme = len(phonemes)
64
+ self.g2p = G2p()
65
+
66
+ self.__prep__()
67
+ self.on_epoch_end()
68
+
69
+ def __prep__(self):
70
+ if self.train:
71
+ print(">> Preparing noise DB")
72
+ noise_list = [str(x) for x in Path(self.noise_dir).rglob('*.wav')]
73
+ self.noise = np.array([])
74
+ for noise in noise_list:
75
+ fs, data = wavfile.read(noise)
76
+ assert fs == self.fs, ">> Error: mismatched sampling frequency.\n{} -> {}".format(noise, fs)
77
+ data = data.astype(np.float32) / 32768.0
78
+ data = (data / np.max(data)) * 0.5
79
+ self.noise = np.append(self.noise, data)
80
+
81
+ self.data = pd.DataFrame(columns=['wav_label', 'wav', 'text', 'duration', 'label', 'type'])
82
+ # Local text-sanitizing helper (takes no `self`; note it is defined but not used below)
+ def process_text(x):
83
+ if isinstance(x, str):
84
+ # Only apply re.sub if x is a string
85
+ return re.sub(r"[^a-zA-Z0-9]+", ' ', x)
86
+ else:
87
+ # Non-string entries (e.g., NaN from pandas) are converted to strings
88
+ return str(x)
89
+ if (self.pkl is not None) and (os.path.isfile(self.pkl)):
90
+ print(">> Load dataset from {}".format(self.pkl))
91
+ self.data = pd.read_pickle(self.pkl)
92
+ else:
93
+ for db in self.train_csv if self.train else self.test_csv:
94
+ csv_list = [str(x) for x in Path(self.csv_dir).rglob('*' + db + '*word*')]
95
+ for n_word in csv_list:
96
+ print(">> processing : {} ".format(n_word))
97
+ df = pd.read_csv(n_word)
98
+ # Split the dataset into matched & unmatched cases
99
+ anc_pos = df[['anchor_text', 'anchor', 'anchor_text', 'anchor_dur']]
100
+ anc_neg = df[['anchor_text', 'anchor', 'comparison_text', 'anchor_dur', 'target', 'type']]
101
+ com_pos = df[['comparison_text', 'comparison', 'comparison_text', 'comparison_dur']]
102
+ com_neg = df[['comparison_text', 'comparison', 'anchor_text', 'comparison_dur', 'target', 'type']]
103
+ anc_pos.columns = ['wav_label', 'anchor', 'anchor_text', 'anchor_dur']
104
+ com_pos.columns = ['wav_label', 'comparison', 'comparison_text', 'comparison_dur']
105
+ anc_pos['label'] = 1
106
+ anc_pos['type'] = df['type']
107
+ com_pos['label'] = 1
108
+ com_pos['type'] = df['type']
109
+ # Concat
110
+ self.data = self.data.append(anc_pos.rename(columns={y: x for x, y in zip(self.data.columns, anc_pos.columns)}), ignore_index=True)
111
+ self.data = self.data.append(anc_neg.rename(columns={y: x for x, y in zip(self.data.columns, anc_neg.columns)}), ignore_index=True)
112
+ self.data = self.data.append(com_pos.rename(columns={y: x for x, y in zip(self.data.columns, com_pos.columns)}), ignore_index=True)
113
+ self.data = self.data.append(com_neg.rename(columns={y: x for x, y in zip(self.data.columns, com_neg.columns)}), ignore_index=True)
114
+
115
+ # Append wav directory path
116
+ self.data['wav'] = self.data['wav'].apply(lambda x: os.path.join(self.wav_dir, x))
117
+ # g2p & p2idx by g2p_en package
118
+ print(">> Convert word to phoneme")
119
+ self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
120
+ print(">> Convert speech word to phoneme")
121
+ self.data['wav_phoneme'] = self.data['wav_label'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
122
+ print(">> Convert phoneme to index")
123
+ self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
124
+ print(">> Convert speech phoneme to index")
125
+ self.data['wav_pIndex'] = self.data['wav_phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
126
+ print(">> Compute phoneme embedding")
127
+ self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
128
+
129
+ print('wav_label', self.data['wav_label'])  # debug: inspect the label column before computing distances
130
+ print('text', self.data['text'])  # debug: inspect the query-text column
131
+
132
+ self.data['dist'] = self.data.apply(lambda x: Levenshtein.ratio(re.sub(r"[^a-zA-Z0-9]+", ' ', x['wav_label']), re.sub(r"[^a-zA-Z0-9]+", ' ', x['text'])), axis=1)
133
+
134
+ # Note: previously commented-out section
135
+ if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
136
+ self.data.to_pickle(self.pkl)
137
+
138
+ # Masking dataset type
139
+ if self.types == 'both':
140
+ pass
141
+ elif self.types == 'easy':
142
+ self.data = self.data.loc[self.data['type'] == 'diffspk_easyneg']
143
+ elif self.types == 'hard':
144
+ self.data = self.data.loc[self.data['type'] == 'diffspk_hardneg']
145
+
146
+ # Get longest data
147
+ self.data = self.data.sort_values(by='duration').reset_index(drop=True)
148
+ self.wav_list = self.data['wav'].values
149
+ self.idx_list = self.data['pIndex'].values
150
+ self.sIdx_list = self.data['wav_pIndex'].values
151
+ self.idx_list = [np.insert(lst, 0, 0) for lst in self.idx_list]
152
+ self.sIdx_list = [np.insert(lst, 0, 0) for lst in self.sIdx_list]
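+ # Index 0 is the <pad> token; prepending it to every phoneme sequence presumably serves
+ # as a leading blank/pad symbol for the CTC-guided objective (assumption, not documented here).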
153
+ self.emb_list = self.data['g2p_embed'].values
154
+ self.lab_list = self.data['label'].values
155
+ if self.edit_dist:
156
+ self.dist_list = self.data['dist'].values
157
+
158
+ # Set dataloader params.
159
+ self.len = len(self.data)
160
+ self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
161
+ self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1 ) * self.fs / 2)
162
+ self.maxlen_l = int((int(self.data['wav_label'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
163
+
164
+ def __len__(self):
165
+ # return total batch-wise length
166
+ return math.ceil(self.len / self.batch_size)
167
+
168
+ def _load_wav(self, wav):
169
+ return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
170
+
171
+ def _mixing_snr(self, clean, snr=[5, 15]):
172
+ def _cal_adjusted_rms(clean_rms, snr):
173
+ a = float(snr) / 20
174
+ noise_rms = clean_rms / (10**a)
175
+ return noise_rms
176
+
177
+ def _cal_rms(amp):
178
+ return np.sqrt(np.mean(np.square(amp), axis=-1))
179
+
180
+ start = np.random.randint(0, len(self.noise)-len(clean))
181
+ divided_noise = self.noise[start: start + len(clean)]
182
+ clean_rms = _cal_rms(clean)
183
+ noise_rms = _cal_rms(divided_noise)
184
+ adj_noise_rms = _cal_adjusted_rms(clean_rms, np.random.randint(snr[0], snr[1]))
185
+
186
+ adj_noise_amp = divided_noise * (adj_noise_rms / (noise_rms + 1e-7))
187
+ noisy = clean + adj_noise_amp
188
+
189
+ if np.max(noisy) > 1:
190
+ noisy = noisy / np.max(noisy)
191
+
192
+ return noisy
193
+
194
+ def __getitem__(self, idx):
195
+ # chunking
196
+ indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
197
+
198
+ # load inputs
199
+ batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
200
+ if self.features == 'both':
201
+ batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
202
+ batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
203
+ else:
204
+ if self.features == 'phoneme':
205
+ batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
206
+ elif self.features == 'g2p_embed':
207
+ batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
208
+ # load outputs
209
+ batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
210
+ batch_l = [np.array(self.sIdx_list[i]).astype(np.int32) for i in indices]
211
+ batch_t = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
212
+ if self.edit_dist:
213
+ batch_d = [np.array([self.dist_list[i]]).astype(np.float32) for i in indices]
214
+
215
+ # padding and masking
216
+ pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
217
+ if self.features == 'both':
218
+ pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
219
+ pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
220
+ else:
221
+ pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
222
+ pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
223
+ pad_batch_l = pad_sequences(np.array(batch_l), maxlen=self.maxlen_l, value=0.0, padding='post', dtype=batch_l[0].dtype)
224
+ pad_batch_t = pad_sequences(np.array(batch_t), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_t[0].dtype)
225
+
226
+ if self.edit_dist:
227
+ pad_batch_d = pad_sequences(np.array(batch_d), value=0.0, padding='post', dtype=batch_d[0].dtype)
228
+
229
+ # Noisy option
230
+ if self.train:
231
+ batch_x_noisy = [self._mixing_snr(x) for x in batch_x]
232
+ pad_batch_x_noisy = pad_sequences(np.array(batch_x_noisy), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x_noisy[0].dtype)
233
+
234
+ if self.train:
235
+ if self.features == 'both':
236
+ return pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t
237
+ else:
238
+ return pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t
239
+ else:
240
+ if self.features == 'both':
241
+ if self.edit_dist:
242
+ return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d
243
+ else:
244
+ return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
245
+ else:
246
+ if self.edit_dist:
247
+ return pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d
248
+ else:
249
+ return pad_batch_x, pad_batch_y, pad_batch_z
250
+
251
+ def on_epoch_end(self):
252
+ self.indices = np.arange(self.len)
253
+ if self.shuffle == True:
254
+ np.random.shuffle(self.indices)
255
+
256
+ def convert_sequence_to_dataset(dataloader):
257
+ def data_generator():
258
+ for i in range(dataloader.__len__()):
259
+ if dataloader.train:
260
+ if dataloader.features == 'both':
261
+ pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t = dataloader[i]
262
+ yield pad_batch_x, pad_batch_x_noisy, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_l, pad_batch_t
263
+ else:
264
+ pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t = dataloader[i]
265
+ yield pad_batch_x, pad_batch_x_noisy, pad_batch_y, pad_batch_z, pad_batch_l, pad_batch_t
266
+ else:
267
+ if dataloader.features == 'both':
268
+ if dataloader.edit_dist:
269
+ pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d = dataloader[i]
270
+ yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z, pad_batch_d
271
+ else:
272
+ pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
273
+ yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
274
+ else:
275
+ if dataloader.edit_dist:
276
+ pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d = dataloader[i]
277
+ yield pad_batch_x, pad_batch_y, pad_batch_z, pad_batch_d
278
+ else:
279
+ pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
280
+ yield pad_batch_x, pad_batch_y, pad_batch_z
281
+
282
+ if dataloader.train:
283
+ if dataloader.features == 'both':
284
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
285
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
286
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
287
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
288
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
289
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
290
+ tf.TensorSpec(shape=(None, dataloader.maxlen_l), dtype=tf.int32),
291
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),)
292
+ )
293
+ else:
294
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
295
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
296
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
297
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
298
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
299
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
300
+ tf.TensorSpec(shape=(None, dataloader.maxlen_l), dtype=tf.int32),
301
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),)
302
+ )
303
+ else:
304
+ if dataloader.features == 'both':
305
+ if dataloader.edit_dist:
306
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
307
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
308
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
309
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
310
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
311
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
312
+ )
313
+ else:
314
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
315
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
316
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
317
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
318
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
319
+ )
320
+ else:
321
+ if dataloader.edit_dist:
322
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
323
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
324
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
325
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
326
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
327
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
328
+ )
329
+ else:
330
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
331
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
332
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
333
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
334
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
335
+ )
336
+ # data_dataset = data_dataset.cache()
337
+ data_dataset = data_dataset.prefetch(1)
338
+
339
+ return data_dataset
340
+
341
+ if __name__ == '__main__':
342
+ GLOBAL_BATCH_SIZE = 2048
343
+ # Redundant first pass without `pkl` (it would rebuild everything from the CSVs and be
+ # immediately overwritten by the pkl-backed loaders below), kept here commented out:
+ # train_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=True, types='both', shuffle=True, features='g2p_embed')
344
+ # test_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=False, edit_dist=True, types='both', shuffle=False, features='g2p_embed')
345
+ train_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=True, types='both', shuffle=True, pkl='/share/nas165/yiting/PhonMatchNet/data/train_both.pkl', features='g2p_embed')
346
+ test_dataset = LibriPhraseDataloader(batch_size=GLOBAL_BATCH_SIZE, train=False, edit_dist=True, types='both', shuffle=False, pkl='/share/nas165/yiting/PhonMatchNet/data/test_both.pkl', features='g2p_embed')
dataset/qualcomm.py ADDED
@@ -0,0 +1,180 @@
1
+ import math, os, re, sys
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import pandas as pd
5
+ from multiprocessing import Pool
6
+ from scipy.io import wavfile
7
+ import tensorflow as tf
8
+
9
+ from tensorflow.keras.utils import Sequence, OrderedEnqueuer
10
+ from tensorflow.keras import layers
11
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
12
+
13
+ sys.path.append(os.path.dirname(__file__))
14
+ from g2p.g2p_en.g2p import G2p
15
+
16
+ import warnings
17
+ warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
18
+ np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
19
+
20
+ class QualcommKeywordSpeechDataloader(Sequence):
21
+ def __init__(self,
22
+ batch_size,
23
+ fs = 16000,
24
+ wav_dir='/home/DB/qualcomm_keyword_speech_dataset',
25
+ target_list=['hey_android', 'hey_snapdragon', 'hi_galaxy', 'hi_lumina'],
26
+ features='g2p_embed', # phoneme, g2p_embed, both ...
27
+ shuffle=True,
28
+ pkl=None,
29
+ ):
30
+
31
+ phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
32
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
33
+ 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
34
+ 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
35
+ 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
36
+ 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
37
+ 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
38
+ ' ']
39
+
40
+ self.p2idx = {p: idx for idx, p in enumerate(phonemes)}
41
+ self.idx2p = {idx: p for idx, p in enumerate(phonemes)}
42
+
43
+ self.batch_size = batch_size
44
+ self.fs = fs
45
+ self.wav_dir = wav_dir
46
+ self.target_list = target_list
47
+ self.features = features
48
+ self.shuffle = shuffle
49
+ self.pkl = pkl
50
+ self.nPhoneme = len(phonemes)
51
+ self.g2p = G2p()
52
+
53
+ self.__prep__()
54
+ self.on_epoch_end()
55
+
56
+ def __prep__(self):
57
+ self.data = pd.DataFrame(columns=['wav', 'text', 'duration', 'label'])
58
+
59
+ if (self.pkl is not None) and (os.path.isfile(self.pkl)):
60
+ print(">> Load dataset from {}".format(self.pkl))
61
+ self.data = pd.read_pickle(self.pkl)
62
+ else:
63
+ print(">> Make dataset from {}".format(self.wav_dir))
64
+ target_dict = {}
65
+ idx = 0
66
+ for target in self.target_list:
67
+ print(">> Extract from {}".format(target))
68
+ wav_list = [str(x) for x in Path(os.path.join(self.wav_dir, target)).rglob('*.wav')]
69
+ for wav in wav_list:
70
+ anchor_text = wav.split('/')[-3].lower().replace('_', ' ')
71
+ duration = float(wavfile.read(wav)[1].shape[-1])/self.fs
72
+ for comparison_text in self.target_list:
73
+ comparison_text = comparison_text.replace('_', ' ')
74
+ label = 1 if anchor_text == comparison_text else 0
75
+ target_dict[idx] = {
76
+ 'wav': wav,
77
+ 'text': comparison_text,
78
+ 'duration': duration,
79
+ 'label': label
80
+ }
81
+ idx += 1
82
+ self.data = self.data.append(pd.DataFrame.from_dict(target_dict, 'index'), ignore_index=True)
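+ # Every wav is paired with every phrase in target_list, giving one positive pair
+ # (its own transcript) and len(target_list) - 1 negatives per utterance.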
83
+
84
+ # g2p & p2idx by g2p_en package
85
+ print(">> Convert word to phoneme")
86
+ self.data['phoneme'] = self.data['text'].apply(lambda x: self.g2p(re.sub(r"[^a-zA-Z0-9]+", ' ', x)))
87
+ print(">> Convert phoneme to index")
88
+ self.data['pIndex'] = self.data['phoneme'].apply(lambda x: [self.p2idx[t] for t in x])
89
+ print(">> Compute phoneme embedding")
90
+ self.data['g2p_embed'] = self.data['text'].apply(lambda x: self.g2p.embedding(x))
91
+
92
+ if (self.pkl is not None) and (not os.path.isfile(self.pkl)):
93
+ self.data.to_pickle(self.pkl)
94
+
95
+ # Get longest data
96
+ self.data = self.data.sort_values(by='duration').reset_index(drop=True)
97
+ self.wav_list = self.data['wav'].values
98
+ self.idx_list = self.data['pIndex'].values
99
+ self.emb_list = self.data['g2p_embed'].values
100
+ self.lab_list = self.data['label'].values
101
+
102
+ # Set dataloader params.
103
+ self.len = len(self.data)
104
+ self.maxlen_t = int((int(self.data['text'].apply(lambda x: len(x)).max() / 10) + 1) * 10)
105
+ self.maxlen_a = int((int(self.data['duration'].values[-1] / 0.5) + 1 ) * self.fs / 2)
106
+
107
+ def __len__(self):
108
+ # return total batch-wise length
109
+ return math.ceil(self.len / self.batch_size)
110
+
111
+ def _load_wav(self, wav):
112
+ return np.array(wavfile.read(wav)[1]).astype(np.float32) / 32768.0
113
+
114
+ def __getitem__(self, idx):
115
+ # chunking
116
+ indices = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
117
+
118
+ # load inputs
119
+ batch_x = [np.array(wavfile.read(self.wav_list[i])[1]).astype(np.float32) / 32768.0 for i in indices]
120
+ if self.features == 'both':
121
+ batch_p = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
122
+ batch_e = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
123
+ else:
124
+ if self.features == 'phoneme':
125
+ batch_y = [np.array(self.idx_list[i]).astype(np.int32) for i in indices]
126
+ elif self.features == 'g2p_embed':
127
+ batch_y = [np.array(self.emb_list[i]).astype(np.float32) for i in indices]
128
+ # load outputs
129
+ batch_z = [np.array([self.lab_list[i]]).astype(np.float32) for i in indices]
130
+
131
+ # padding and masking
132
+ pad_batch_x = pad_sequences(np.array(batch_x), maxlen=self.maxlen_a, value=0.0, padding='post', dtype=batch_x[0].dtype)
133
+ if self.features == 'both':
134
+ pad_batch_p = pad_sequences(np.array(batch_p), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_p[0].dtype)
135
+ pad_batch_e = pad_sequences(np.array(batch_e), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_e[0].dtype)
136
+ else:
137
+ pad_batch_y = pad_sequences(np.array(batch_y), maxlen=self.maxlen_t, value=0.0, padding='post', dtype=batch_y[0].dtype)
138
+ pad_batch_z = pad_sequences(np.array(batch_z), value=0.0, padding='post', dtype=batch_z[0].dtype)
139
+
140
+ if self.features == 'both':
141
+ return pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
142
+ else:
143
+ return pad_batch_x, pad_batch_y, pad_batch_z
144
+
145
+ def on_epoch_end(self):
146
+ self.indices = np.arange(self.len)
147
+ if self.shuffle == True:
148
+ np.random.shuffle(self.indices)
149
+
150
+ def convert_sequence_to_dataset(dataloader):
151
+ def data_generator():
152
+ for i in range(dataloader.__len__()):
153
+ if dataloader.features == 'both':
154
+ pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z = dataloader[i]
155
+ yield pad_batch_x, pad_batch_p, pad_batch_e, pad_batch_z
156
+ else:
157
+ pad_batch_x, pad_batch_y, pad_batch_z = dataloader[i]
158
+ yield pad_batch_x, pad_batch_y, pad_batch_z
159
+
160
+ if dataloader.features == 'both':
161
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
162
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
163
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t), dtype=tf.int32),
164
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t, 256), dtype=tf.float32),
165
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
166
+ )
167
+ else:
168
+ data_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=(
169
+ tf.TensorSpec(shape=(None, dataloader.maxlen_a), dtype=tf.float32),
170
+ tf.TensorSpec(shape=(None, dataloader.maxlen_t) if dataloader.features == 'phoneme' else (None, dataloader.maxlen_t, 256),
171
+ dtype=tf.int32 if dataloader.features == 'phoneme' else tf.float32),
172
+ tf.TensorSpec(shape=(None, 1), dtype=tf.float32),)
173
+ )
174
+ # data_dataset = data_dataset.cache()
175
+ data_dataset = data_dataset.prefetch(1)
176
+
177
+ return data_dataset
178
+
179
+ if __name__ == '__main__':
180
+ dataloader = QualcommKeywordSpeechDataloader(2048, pkl='/home/DB/qualcomm_keyword_speech_dataset/qualcomm.pkl', features='g2p_embed')
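+ # Hedged usage sketch (names from this file; not part of the original script):
+ # dataset = convert_sequence_to_dataset(dataloader)
+ # for pad_batch_x, pad_batch_y, pad_batch_z in dataset:
+ #     pass  # pad_batch_x: padded audio, pad_batch_y: g2p embeddings, pad_batch_z: labels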
demo.py ADDED
@@ -0,0 +1,168 @@
1
+ import os, warnings, argparse
2
+ import tensorflow as tf
3
+ import numpy as np
4
+ from model import ukws
5
+ from dataset import dataloader_demo
6
+ import gradio as gr
7
+ # import librosa
8
+
9
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
10
+ tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
11
+ warnings.filterwarnings('ignore')
12
+ warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
13
+ np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
14
+ warnings.simplefilter("ignore")
15
+
16
+ seed = 42
17
+ tf.random.set_seed(seed)
18
+ np.random.seed(seed)
19
+
20
+
21
+ parser = argparse.ArgumentParser()
22
+
23
+ parser.add_argument('--text_input', required=False, type=str, default='g2p_embed')
24
+ parser.add_argument('--audio_input', required=False, type=str, default='both')
25
+ parser.add_argument('--load_checkpoint_path', required=True, type=str)
26
+ parser.add_argument('--keyword_list_length', required=True, type=int)
27
+ parser.add_argument('--stack_extractor', action='store_true')
28
+ parser.add_argument('--comment', required=False, type=str)
29
+ args = parser.parse_args()
30
+
31
+ gpus = tf.config.experimental.list_physical_devices('GPU')
32
+ if gpus:
33
+ try:
34
+ for gpu in gpus:
35
+ tf.config.experimental.set_memory_growth(gpu, True)
36
+ except RuntimeError as e:
37
+ print(e)
38
+
39
+ strategy = tf.distribute.MirroredStrategy()
40
+ batch_size = args.keyword_list_length
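+ # One batch row per candidate keyword, so a single forward pass scores the whole keyword list against one utterance.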
41
+ # Global batch size across all replicas (keyword-list length x number of replicas)
42
+ GLOBAL_BATCH_SIZE = batch_size * strategy.num_replicas_in_sync
43
+ # BATCH_SIZE_PER_REPLICA = GLOBAL_BATCH_SIZE / strategy.num_replicas_in_sync
44
+
45
+ # Make Dataloader
46
+ text_input = args.text_input
47
+ audio_input = args.audio_input
48
+ load_checkpoint_path = args.load_checkpoint_path
49
+
50
+ phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
51
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
52
+ 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
53
+ 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
54
+ 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
55
+ 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
56
+ 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
57
+ ' ']
58
+ # Number of phonemes
59
+ vocab = len(phonemes)
60
+
61
+ # Model params.
62
+ kwargs = {
63
+ 'vocab' : vocab,
64
+ 'text_input' : text_input,
65
+ 'audio_input' : audio_input,
66
+ 'frame_length' : 400,
67
+ 'hop_length' : 160,
68
+ 'num_mel' : 40,
69
+ 'sample_rate' : 16000,
70
+ 'log_mel' : False,
71
+ 'stack_extractor' : args.stack_extractor,
72
+ }
73
+
74
+
75
+
76
+ # Make tensorboard dict.
77
+ global keyword
78
+ param = kwargs
79
+ param['comment'] = args.comment
80
+
81
+
82
+ with strategy.scope():
83
+
84
+
85
+ model = ukws.BaseUKWS(**kwargs)
86
+ if args.load_checkpoint_path:
87
+ checkpoint_dir=args.load_checkpoint_path
88
+ checkpoint = tf.train.Checkpoint(model=model)
89
+ checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
90
+ latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
91
+ if latest_checkpoint:
92
+ checkpoint.restore(latest_checkpoint)
93
+ print("Checkpoint restored!")
94
+ else:
95
+ print("No checkpoint found.")
96
+
97
+ def inference(audio, keyword):
98
+
99
+ if isinstance(keyword, str):
100
+ keyword = [kw.strip() for kw in keyword.split(',')]
101
+
102
+ test_google_dataset = dataloader_demo.GoogleCommandsDataloader(batch_size=GLOBAL_BATCH_SIZE, features=text_input, wav_path_or_object=audio, keyword=keyword)
103
+
104
+ test_google_dataset = dataloader_demo.convert_sequence_to_dataset(test_google_dataset)
105
+
106
+ test_google_dist_dataset = strategy.experimental_distribute_dataset(test_google_dataset)
107
+
108
+
109
+ # @tf.function
110
+ def test_step_metric_only(inputs, keyword_list):
111
+ clean_speech = inputs[0]
112
+ text = inputs[1]
113
+ labels = inputs[2]
114
+ prob, affinity_matrix = model(clean_speech, text, training=False)[:2]
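+ # Scores are rounded to 3 decimals; the top-scoring keyword is accepted only if its
+ # confidence clears the fixed 0.8 threshold below, otherwise 'no keyword' is returned.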
115
+ prob = tf.round(prob * 1000) / 1000
116
+ prob = prob.numpy().flatten()
117
+ max_indices = np.argmax(prob, axis=0)
118
+ if prob[max_indices] >= 0.8:
119
+ keyword = keyword_list[max_indices]
120
+ else:
121
+ keyword = 'no keyword'
122
+
123
+ print('keyword:', keyword_list)
124
+ print('prob:', prob)
125
+ msg = ''
126
+ for k, p in zip(keyword_list, prob):
127
+ msg += '{} | {:.2f} \n'.format(k, p)
128
+
129
+ return keyword, msg
130
+
131
+ for x in test_google_dist_dataset:
132
+ keyword, prob = test_step_metric_only(x, keyword)
133
+
134
+
135
+ return keyword, prob
136
+
137
+ # keyword = ['realtek go','ok google','vintage','hackney','crocodile','surroundings','oversaw','northwestern']
138
+ # audio = '/share/nas165/yiting/recording/ok_google/Default_20240725-183000.wav'
139
+ # inference(audio,keyword)
140
+
141
+ demo = gr.Interface(
142
+ fn=inference,
143
+ inputs=[gr.Audio(source="upload", label="Sound"),
144
+ gr.Textbox(placeholder="Keyword List Here...", label="keyword_list")],
145
+ examples=[
146
+ ["./recording/ok_google/ok_google-183000.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
147
+ ["./recording/ok_google/ok_google-183005.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
148
+ ["./recording/ok_google/ok_google-183008.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
149
+ ["./recording/ok_google/ok_google-183011.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
150
+ ["./recording/ok_google/ok_google-183015.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
151
+ ["./recording/realtek_go/realtek_go-183029.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
152
+ ["./recording/realtek_go/realtek_go-183033.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
153
+ ["./recording/realtek_go/realtek_go-183036.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
154
+ ["./recording/realtek_go/realtek_go-183039.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
155
+ ["./recording/realtek_go/realtek_go-183043.wav", 'realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern'],
156
+ ],
157
+ outputs=[gr.Textbox(label="keyword"), gr.Textbox(label="Confidence Score of keyword")],
158
+ )
159
+
160
+ demo.launch(server_name='0.0.0.0', server_port=7860, share=True)
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
docker/Dockerfile ADDED
@@ -0,0 +1,25 @@
1
+ FROM tensorflow/tensorflow:2.4.1-gpu
2
+
3
+ # Install dependency
4
+ RUN apt-key adv --keyserver keyserver.ubuntu.com --recv A4B469963BF863CC
5
+ RUN apt-get update -y && apt-get install -y \
6
+ git \
7
+ libsndfile1
8
+
9
+ # Install python packages
10
+ RUN python -m pip install --upgrade pip && pip install \
11
+ levenshtein \
12
+ six \
13
+ audioread \
14
+ librosa \
15
+ PySoundFile \
16
+ scipy \
17
+ tqdm \
18
+ pandas \
19
+ nltk \
20
+ inflect
21
+
22
+ RUN python -m pip uninstall -y numpy
23
+ RUN python -m pip install numpy==1.18.5
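+ # numpy is pinned to 1.18.5 (assumption: to stay compatible with the TF 2.4.1 base image
+ # and the np.warnings / np.VisibleDeprecationWarning usage in the dataloaders)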
24
+
25
+ WORKDIR /home
flagged/Sound/c129aef35ba4cb66620f813cd7268c4be510a66d/ok_google-183000.wav ADDED
Binary file (96.3 kB). View file
 
flagged/Sound/d35a5cf80a9403828bc601a0a761a5f88da06f00/realtek_go-183033.wav ADDED
Binary file (101 kB). View file
 
flagged/log.csv ADDED
@@ -0,0 +1,8 @@
1
+ Sound,keyword_list,keyword,Confidence Score of keyword,flag,username,timestamp
2
+ /share/nas165/yiting/CL-KWS_202408_v1/flagged/Sound/c129aef35ba4cb66620f813cd7268c4be510a66d/ok_google-183000.wav,"realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern",,,,,2024-09-11 09:54:49.824521
3
+ /share/nas165/yiting/CL-KWS_202408_v1/flagged/Sound/d35a5cf80a9403828bc601a0a761a5f88da06f00/realtek_go-183033.wav,"realtek go,ok google,vintage,hackney,crocodile,surroundings,oversaw,northwestern",ok google,"ok cortana | 0.11
4
+ ok google | 0.97
5
+ hey google | 0.46
6
+ oh come google | 0.87
7
+ ok gogo | 0.91
8
+ ",,,2024-09-11 10:23:11.972172
inference.py ADDED
@@ -0,0 +1,141 @@
1
+ import sys, os, datetime, warnings, argparse
2
+ import tensorflow as tf
3
+ import numpy as np
4
+
5
+ from model import ukws
6
+ from dataset import google_infe202405
7
+
8
+
9
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
10
+ tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
11
+ warnings.filterwarnings('ignore')
12
+ warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
13
+ np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
14
+ warnings.simplefilter("ignore")
15
+
16
+ seed = 42
17
+ tf.random.set_seed(seed)
18
+ np.random.seed(seed)
19
+
20
+
21
+ parser = argparse.ArgumentParser()
22
+
23
+ parser.add_argument('--text_input', required=False, type=str, default='g2p_embed')
24
+ parser.add_argument('--audio_input', required=False, type=str, default='both')
25
+ parser.add_argument('--load_checkpoint_path', required=True, type=str)
26
+
27
+ parser.add_argument('--google_pkl', required=False, type=str, default='/home/DB/data/google_test_all.pkl')
28
+ parser.add_argument('--stack_extractor', action='store_true')
29
+ args = parser.parse_args()
30
+
31
+ gpus = tf.config.experimental.list_physical_devices('GPU')
32
+ if gpus:
33
+ try:
34
+ for gpu in gpus:
35
+ tf.config.experimental.set_memory_growth(gpu, True)
36
+ except RuntimeError as e:
37
+ print(e)
38
+
39
+ strategy = tf.distribute.MirroredStrategy()
40
+
41
+ # Global batch size across all replicas (1000 per replica x number of replicas)
42
+ GLOBAL_BATCH_SIZE = 1000 * strategy.num_replicas_in_sync
43
+ BATCH_SIZE_PER_REPLICA = GLOBAL_BATCH_SIZE / strategy.num_replicas_in_sync
44
+
45
+ # Make Dataloader
46
+ text_input = args.text_input
47
+ audio_input = args.audio_input
48
+ load_checkpoint_path = args.load_checkpoint_path
49
+
50
+
51
+ test_google_dataset = google_infe202405.GoogleCommandsDataloader(batch_size=GLOBAL_BATCH_SIZE, features=text_input, shuffle=False, pkl=args.google_pkl)
52
+
53
+ test_google_dataset = google_infe202405.convert_sequence_to_dataset(test_google_dataset)
54
+
55
+ test_google_dist_dataset = strategy.experimental_distribute_dataset(test_google_dataset)
56
+
57
+ phonemes = ["<pad>", ] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
58
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH',
59
+ 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
60
+ 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2',
61
+ 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0',
62
+ 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1',
63
+ 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH',
64
+ ' ']
65
+ # Number of phonemes
66
+ vocab = len(phonemes)
67
+
68
+ # Model params.
69
+ kwargs = {
70
+ 'vocab' : vocab,
71
+ 'text_input' : text_input,
72
+ 'audio_input' : audio_input,
73
+ 'frame_length' : 400,
74
+ 'hop_length' : 160,
75
+ 'num_mel' : 40,
76
+ 'sample_rate' : 16000,
77
+ 'log_mel' : False,
78
+ 'stack_extractor' : args.stack_extractor,
79
+ }
80
+
81
+
82
+ # Make tensorboard dict.
83
+ param = kwargs
84
+
85
+
86
+ with strategy.scope():
87
+
88
+
89
+ model = ukws.BaseUKWS(**kwargs)
90
+
91
+
92
+ if args.load_checkpoint_path:
93
+ checkpoint_dir=args.load_checkpoint_path
94
+ checkpoint = tf.train.Checkpoint(model=model)
95
+ checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
96
+ latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
97
+ if latest_checkpoint:
98
+ checkpoint.restore(latest_checkpoint)
99
+ print("Checkpoint restored!")
100
+
101
+
102
+
103
+ # @tf.function
104
+ def test_step_metric_only(inputs):
105
+
106
+ clean_speech = inputs[0]
107
+ text = inputs[1]
108
+ labels = inputs[2]
109
+
110
+ prob = model(clean_speech, text, training=False)[0]
111
+
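+ # The test pickle groups each utterance with 20 candidate keywords (one positive per group,
+ # as implied by the argmax over labels), so scores are reshaped to (num_utterances, 20).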
112
+ dim1 = labels.shape[0] // 20
114
+ prob = tf.reshape(prob, [dim1, 20])
115
+ labels = tf.reshape(labels, [dim1, 20])
115
+ predictions = tf.math.argmax(prob, axis=1)
116
+ actuals = tf.math.argmax(labels, axis=1)
117
+
118
+ true_count = tf.reduce_sum(tf.cast(tf.math.equal(predictions, actuals), tf.float32)).numpy()
119
+ num_testdata = dim1
120
+ return true_count, num_testdata
121
+
122
+
123
+ def distributed_test_step_metric_only(dataset_inputs):
124
+ true_count, num_testdata = strategy.run(test_step_metric_only, args=(dataset_inputs,))
125
+ return true_count, num_testdata
126
+
127
+
128
+ total_true_count = 0
129
+ total_num_testdata = 0
130
+ for x in test_google_dist_dataset:
131
+ true_count, num_testdata = distributed_test_step_metric_only(x)
132
+ total_true_count += true_count
133
+ total_num_testdata += num_testdata
134
+ accuracy = total_true_count / total_num_testdata * 100.0
135
+ print("準確率:", accuracy, "%")
136
+
137
+
138
+
139
+
140
+
141
+
model/__pycache__/discriminator.cpython-37.pyc ADDED
Binary file (2.35 kB). View file
 
model/__pycache__/encoder.cpython-37.pyc ADDED
Binary file (5.6 kB). View file
 
model/__pycache__/extractor.cpython-37.pyc ADDED
Binary file (3.82 kB). View file
 
model/__pycache__/log_melspectrogram.cpython-37.pyc ADDED
Binary file (2.17 kB). View file
 
model/__pycache__/speech_embedding.cpython-37.pyc ADDED
Binary file (1.75 kB). View file