Spaces:

GroNLP
/

neural-acoustic-distance

Running

App Files Files Community

Martijn Bartelds committed on Mar 10, 2022

Commit

575567c

1 Parent(s): 9c307a0

Update app

Browse files

Files changed (1) hide show

neural_acoustic_distance.py +70 -71

neural_acoustic_distance.py CHANGED Viewed

@@ -27,66 +27,66 @@ model_id = st.selectbox(
 if model_id == "other":
     model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:", value = "facebook/wav2vec2-large-960h", key = "model")
-# try:
-cfg = AutoConfig.from_pretrained(model_id)
-layer = st.number_input("Select the layer you want to use:",
-    min_value = 1, max_value = cfg.num_hidden_layers, value=10)
-def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
-    from transformers.models.wav2vec2 import Wav2Vec2Model
-    import soundfile as sf
-    from scipy import signal
-    import torch
-    import numpy as np
-    transformers.logging.set_verbosity(transformers.logging.ERROR)
-    model_kwargs = {}
-    if layer is not None:
-        model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
-    with st.spinner("Loading..."):
-        model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
-        model.eval()
-        if torch.cuda.is_available():
-            model.cuda()
-    st.success("Done!")
-    @torch.no_grad()
-    def _featurize(path):
-        input_values, rate = sf.read(path, dtype=np.float32)
-        if len(input_values.shape) == 2:
-            input_values = input_values.mean(1)
-        if rate != 16_000:
-            new_length = int(input_values.shape[0] / rate * 16_000)
-            input_values = signal.resample(input_values, new_length)
-        input_values = torch.from_numpy(input_values).unsqueeze(0)
-        if torch.cuda.is_available():
-            input_values = input_values.cuda()
-        if layer is None:
-            hidden_states = model(input_values, output_hidden_states=True).hidden_states
-            hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
-            return hidden_states
-        if layer >= 0:
-            hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
-        else:
-            hidden_state = model.feature_extractor(input_values)
-            hidden_state = hidden_state.transpose(1, 2)
-            if layer == -1:
-                hidden_state = model.feature_projection(hidden_state)
-            hidden_state = hidden_state.squeeze(0).cpu().numpy()
-        return hidden_state
-    return _featurize
-featurizer_a = load_wav2vec2_featurizer(model_id, layer)
-# except OSError:
-    # st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
-    # featurizer_a = None
 def aligner(x, y) -> Any:
     return dtw(x, y, keep_internals=True)
@@ -173,16 +173,15 @@ if filename_x is not None and filename_y is not None and featurizer_a is not Non
     st.pyplot(fig)
     if os.path.isfile("./output/plot.pdf"):
-        if st.button("Info"):
-            st.write(" Visualization of neural acoustic distances\
-            per frame (based on wav2vec 2.0) with the pronunciation of\
-            of the first filename on the x-axis and distances to the pronunciation\
-            of second filename on the y-axis. The horizontal line represents\
-            the global distance value (i.e. the average of all individual frames).\
-            The blue continuous line represents the moving average distance based on 9 frames,\
-            corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of\
-            the sample. Larger bullet sizes indicate that multiple\
-            frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
     with open("./output/plot.pdf", "rb") as file:
         btn = st.download_button(
@@ -190,4 +189,4 @@ if filename_x is not None and filename_y is not None and featurizer_a is not Non
                 data=file,
                 file_name="plot.pdf",
                 mime="image/pdf"
-            )

 if model_id == "other":
     model_id = st.text_input("Enter the wav2vec 2.0 model you want to use:", value = "facebook/wav2vec2-large-960h", key = "model")
+try:
+    cfg = AutoConfig.from_pretrained(model_id)
+    layer = st.number_input("Select the layer you want to use:",
+        min_value = 1, max_value = cfg.num_hidden_layers, value=10)
+    def load_wav2vec2_featurizer(model_id: str, layer: Optional[int] = None):
+        from transformers.models.wav2vec2 import Wav2Vec2Model
+        import soundfile as sf
+        from scipy import signal
+        import torch
+        import numpy as np
+        transformers.logging.set_verbosity(transformers.logging.ERROR)
+        model_kwargs = {}
+        if layer is not None:
+            model_kwargs["num_hidden_layers"] = layer if layer > 0 else 0
+        with st.spinner("Loading..."):
+            model = Wav2Vec2Model.from_pretrained(model_id, **model_kwargs)
+            model.eval()
+            if torch.cuda.is_available():
+                model.cuda()
+        st.success("Done!")
+        @torch.no_grad()
+        def _featurize(path):
+            input_values, rate = sf.read(path, dtype=np.float32)
+            if len(input_values.shape) == 2:
+                input_values = input_values.mean(1)
+            if rate != 16_000:
+                new_length = int(input_values.shape[0] / rate * 16_000)
+                input_values = signal.resample(input_values, new_length)
+            input_values = torch.from_numpy(input_values).unsqueeze(0)
+            if torch.cuda.is_available():
+                input_values = input_values.cuda()
+            if layer is None:
+                hidden_states = model(input_values, output_hidden_states=True).hidden_states
+                hidden_states = [s.squeeze(0).cpu().numpy() for s in hidden_states]
+                return hidden_states
+            if layer >= 0:
+                hidden_state = model(input_values).last_hidden_state.squeeze(0).cpu().numpy()
+            else:
+                hidden_state = model.feature_extractor(input_values)
+                hidden_state = hidden_state.transpose(1, 2)
+                if layer == -1:
+                    hidden_state = model.feature_projection(hidden_state)
+                hidden_state = hidden_state.squeeze(0).cpu().numpy()
+            return hidden_state
+        return _featurize
+    featurizer_a = load_wav2vec2_featurizer(model_id, layer)
+except OSError:
+    st.error("Please select a wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2).")
+    featurizer_a = None
 def aligner(x, y) -> Any:
     """Align ``x`` with ``y`` via ``dtw`` and return the resulting alignment.

     ``keep_internals=True`` is passed through to ``dtw`` (presumably so the
     intermediate matrices stay available on the result -- confirm against
     the dtw library in use).
     """
     alignment = dtw(x, y, keep_internals=True)
     return alignment
     st.pyplot(fig)
     if os.path.isfile("./output/plot.pdf"):
+        st.caption(" Visualization of neural acoustic distances\
+        per frame (based on wav2vec 2.0) with the pronunciation of\
+        of the first filename on the x-axis and distances to the pronunciation\
+        of second filename on the y-axis. The horizontal line represents\
+        the global distance value (i.e. the average of all individual frames).\
+        The blue continuous line represents the moving average distance based on 9 frames,\
+        corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of\
+        the sample. Larger bullet sizes indicate that multiple\
+        frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
     with open("./output/plot.pdf", "rb") as file:
         btn = st.download_button(
                 data=file,
                 file_name="plot.pdf",
                 mime="image/pdf"
+            )