Spaces:
Running
Running
Martijn Bartelds
committed on
Commit
·
575567c
1
Parent(s):
9c307a0
Update app
Browse files- neural_acoustic_distance.py +70 -71
neural_acoustic_distance.py
CHANGED
@@ -27,66 +27,66 @@ model_id = st.selectbox(
|
|
27 |
if model_id == "other":
|
28 |
model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:", value = "facebook/wav2vec2-large-960h", key = "model")
|
29 |
|
30 |
-
|
31 |
-
cfg = AutoConfig.from_pretrained(model_id)
|
32 |
-
layer = st.number_input("Select the layer you want to use:",
|
33 |
-
|
34 |
-
|
35 |
-
def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
featurizer_a = load_wav2vec2_featurizer(model_id, layer)
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
|
91 |
def aligner(x, y) -> Any:
    """Return the result of ``dtw(x, y, keep_internals=True)``."""
    alignment = dtw(x, y, keep_internals=True)
    return alignment
|
@@ -173,16 +173,15 @@ if filename_x is not None and filename_y is not None and featurizer_a is not Non
|
|
173 |
st.pyplot(fig)
|
174 |
|
175 |
if os.path.isfile("./output/plot.pdf"):
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
|
186 |
|
187 |
with open("./output/plot.pdf", "rb") as file:
|
188 |
btn = st.download_button(
|
@@ -190,4 +189,4 @@ if filename_x is not None and filename_y is not None and featurizer_a is not Non
|
|
190 |
data=file,
|
191 |
file_name="plot.pdf",
|
192 |
mime="image/pdf"
|
193 |
-
)
|
|
|
27 |
# When "other" is selected, ask for the model identifier in a text field
# (pre-filled with a default checkpoint name).
if model_id == "other":
    model_id = st.text_input(
        "Enter the wav2vec 2.0 model you want to use:",
        value="facebook/wav2vec2-large-960h",
        key="model",
    )
|
29 |
|
30 |
+
try:
    # The configuration is read first because its layer count bounds the
    # layer selector below.
    cfg = AutoConfig.from_pretrained(model_id)
    layer = st.number_input(
        "Select the layer you want to use:",
        min_value=1,
        max_value=cfg.num_hidden_layers,
        # Clamp the default: a fixed default of 10 would lie above max_value
        # for checkpoints with fewer than 10 layers, which Streamlit rejects.
        value=min(10, cfg.num_hidden_layers),
    )

    def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
        """Load a wav2vec 2.0 model and return a function that featurizes audio files.

        Args:
            model_id: Identifier passed to ``Wav2Vec2Model.from_pretrained``.
            layer: Selects the representation returned by the featurizer.
                ``None`` returns every hidden state. A value ``>= 0`` loads
                the model with ``num_hidden_layers`` set to that value and
                returns its last hidden state. A negative value loads the
                model with ``num_hidden_layers=0`` and returns the
                feature-extractor output, additionally passed through the
                feature projection when ``layer == -1``.

        Returns:
            A function that takes an audio file path and returns a NumPy
            array, or a list of NumPy arrays when ``layer`` is ``None``.
        """
        from transformers.models.wav2vec2 import Wav2Vec2Model
        import soundfile as sf
        from scipy import signal
        import torch
        import numpy as np

        transformers.logging.set_verbosity(transformers.logging.ERROR)

        model_kwargs = {}
        if layer is not None:
            # Negative layers only use the convolutional front end, so no
            # transformer layers are requested for them.
            model_kwargs["num_hidden_layers"] = max(layer, 0)

        with st.spinner("Loading..."):
            model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
            model.eval()
            if torch.cuda.is_available():
                model.cuda()
        st.success("Done!")

        @torch.no_grad()
        def _featurize(path):
            """Read the audio at ``path`` and return the selected representation."""
            input_values, rate = sf.read(path, dtype=np.float32)
            if len(input_values.shape) == 2:
                # 2-D input: average over the second axis (presumably the
                # channel axis) to obtain a single signal.
                input_values = input_values.mean(1)
            if rate != 16_000:
                # Resample to 16 kHz, scaling the sample count by the rate ratio.
                new_length = int(input_values.shape[0] / rate * 16_000)
                input_values = signal.resample(input_values, new_length)

            # Add a leading batch dimension of size one.
            input_values = torch.from_numpy(input_values).unsqueeze(0)
            if torch.cuda.is_available():
                input_values = input_values.cuda()

            if layer is None:
                hidden_states = model(input_values, output_hidden_states=True).hidden_states
                hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
                return hidden_states

            if layer >= 0:
                hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
            else:
                hidden_state = model.feature_extractor(input_values)
                hidden_state = hidden_state.transpose(1, 2)
                if layer == -1:
                    hidden_state = model.feature_projection(hidden_state)
                hidden_state = hidden_state.squeeze(0).cpu().numpy()

            return hidden_state

        return _featurize

    featurizer_a = load_wav2vec2_featurizer(model_id, layer)
except OSError:
    # Reached when the identifier cannot be loaded; the app continues
    # without a featurizer.
    st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
    featurizer_a = None
|
90 |
|
91 |
def aligner(x, y) -> Any:
    """Return the result of ``dtw(x, y, keep_internals=True)``."""
    alignment = dtw(x, y, keep_internals=True)
    return alignment
|
|
|
173 |
st.pyplot(fig)
|
174 |
|
175 |
if os.path.isfile("./output/plot.pdf"):
|
176 |
+
# Adjacent string literals keep the caption text independent of source
# indentation. Fixes the duplicated "of" and the missing "the".
st.caption(
    " Visualization of neural acoustic distances "
    "per frame (based on wav2vec 2.0) with the pronunciation "
    "of the first filename on the x-axis and distances to the pronunciation "
    "of the second filename on the y-axis. The horizontal line represents "
    "the global distance value (i.e. the average of all individual frames). "
    "The blue continuous line represents the moving average distance based on 9 frames, "
    "corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of "
    "the sample. Larger bullet sizes indicate that multiple "
    "frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis."
)
|
|
|
185 |
|
186 |
with open("./output/plot.pdf", "rb") as file:
|
187 |
btn = st.download_button(
|
|
|
189 |
data=file,
|
190 |
file_name="plot.pdf",
|
191 |
mime="image/pdf"
|
192 |
+
)
|