Spaces:
Running
Running
Update neural_acoustic_distance.py
Browse files- neural_acoustic_distance.py +24 -24
neural_acoustic_distance.py
CHANGED
@@ -112,11 +112,11 @@ def main():
|
|
112 |
|
113 |
st.write(
|
114 |
"This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files containing a single spoken word. \n\n\
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
|
121 |
st.subheader("Model selection:")
|
122 |
|
@@ -208,28 +208,28 @@ def main():
|
|
208 |
plt_id = randrange(0, 10)
|
209 |
plt.savefig("./output/plot" + str(plt_id) + ".pdf")
|
210 |
st.pyplot(fig)
|
211 |
-
|
212 |
-
print('7. Plot filled', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
|
213 |
-
|
214 |
-
if os.path.isfile("./output/plot.pdf"):
|
215 |
-
st.caption(" Visualization of neural acoustic distances\
|
216 |
-
per frame (based on wav2vec 2.0) with the pronunciation of\
|
217 |
-
the first filename on the x-axis and distances to the pronunciation\
|
218 |
-
of second filename on the y-axis. The horizontal line represents\
|
219 |
-
the global distance value (i.e. the average of all individual frames).\
|
220 |
-
The blue continuous line represents the moving average distance based on 9 frames,\
|
221 |
-
corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of\
|
222 |
-
the sample. Larger bullet sizes indicate that multiple\
|
223 |
-
frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
|
224 |
-
|
225 |
-
with open("./output/plot.pdf", "rb") as file:
|
226 |
-
btn = st.download_button(label="Download plot", data=file, file_name="plot.pdf", mime="image/pdf")
|
227 |
-
|
228 |
-
print('8. End', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
|
229 |
-
print(f"9. RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB") # test
|
230 |
|
231 |
main()
|
232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
for name in dir():
|
234 |
if not name.startswith('_'):
|
235 |
del globals()[name]
|
|
|
112 |
|
113 |
st.write(
|
114 |
"This tool visualizes pronunciation differences between two recordings of the same word. The two recordings have to be wave files containing a single spoken word. \n\n\
|
115 |
+
Choose any wav2vec 2.0 compatible model identifier on the [Hugging Face Model Hub](https://huggingface.co/models?filter=wav2vec2) and select the output layer you want to use.\n\n\
|
116 |
+
To upload your own recordings select 'custom upload' in the audio file selection step. The first recording is put on the x-axis of the plot and the second one will be the reference recording for computing distance.\n\
|
117 |
+
You should already see an example plot of two sample recordings.\n\n\
|
118 |
+
This visualization tool is part of [neural representations for modeling variation in speech](https://doi.org/10.1016/j.wocn.2022.101137). \n\
|
119 |
+
Please see our paper for further details.")
|
120 |
|
121 |
st.subheader("Model selection:")
|
122 |
|
|
|
208 |
plt_id = randrange(0, 10)
|
209 |
plt.savefig("./output/plot" + str(plt_id) + ".pdf")
|
210 |
st.pyplot(fig)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
main()
|
213 |
|
214 |
+
print('7. Plot filled', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
|
215 |
+
|
216 |
+
if os.path.isfile("./output/plot.pdf"):
|
217 |
+
st.caption(" Visualization of neural acoustic distances\
|
218 |
+
per frame (based on wav2vec 2.0) with the pronunciation of\
|
219 |
+
the first filename on the x-axis and distances to the pronunciation\
|
220 |
+
of second filename on the y-axis. The horizontal line represents\
|
221 |
+
the global distance value (i.e. the average of all individual frames).\
|
222 |
+
The blue continuous line represents the moving average distance based on 9 frames,\
|
223 |
+
corresponding to 180ms. As a result of the moving average, the blue line does not cover the entire duration of\
|
224 |
+
the sample. Larger bullet sizes indicate that multiple\
|
225 |
+
frames in the pronunciation on the y-axis are aligned to a single frame in the pronunciation on the x-axis.")
|
226 |
+
|
227 |
+
with open("./output/plot.pdf", "rb") as file:
|
228 |
+
btn = st.download_button(label="Download plot", data=file, file_name="plot.pdf", mime="image/pdf")
|
229 |
+
|
230 |
+
print('8. End', datetime.now().strftime('%d-%m-%Y %H:%M:%S')) # test
|
231 |
+
print(f"9. RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB") # test
|
232 |
+
|
233 |
for name in dir():
|
234 |
if not name.startswith('_'):
|
235 |
del globals()[name]
|