"""Gradio demo that plots the distance between two ISO 639-3 languages according to several distance measures."""
import json | |
import pickle | |
import gradio as gr | |
import matplotlib.pyplot as plt | |
import numpy as np | |
def plot_scalar_on_scale(scalar_value, distance_type):
    """Draw a scalar as a marker on a horizontal "Close" to "Distant" colour bar.

    Args:
        scalar_value: The value to display. It is clipped to the range
            [0.0, 1.0] before plotting.
        distance_type: Text used as the title of the plot.

    Returns:
        The matplotlib figure containing the finished plot.
    """
    # Build the figure directly instead of through pyplot. Figures created
    # with plt.subplots() stay registered in pyplot's global figure manager
    # until they are explicitly closed, so a long-running server that plots
    # on every request would keep accumulating them.
    from matplotlib.figure import Figure

    # Ensure the scalar is within bounds
    scalar_value = np.clip(scalar_value, 0.0, 1.0)

    fig = Figure(figsize=(8, 2))
    ax = fig.subplots()

    # Horizontal gradient used as the background (from close to distant)
    gradient = np.linspace(0, 1, 256).reshape(1, -1)
    ax.imshow(gradient, extent=[0, 1, 0, 1], aspect='auto', cmap='viridis_r')

    # Mark the value with a vertical line and a large dot that carries the
    # number rounded to two decimals
    ax.axvline(x=scalar_value, color='white', lw=5)
    ax.plot(scalar_value, 0.5, 'o', color='white', markersize=42)
    ax.text(scalar_value, 0.5, f'{scalar_value:.2f}', color='black', ha='center', va='center', fontsize=14)

    # Labels rotated 90 degrees, placed just outside the two ends of the bar
    ax.text(-0.03, 0.5, 'Close', ha='center', va='center', fontsize=14, rotation=90)
    ax.text(1.03, 0.5, 'Distant', ha='center', va='center', fontsize=14, rotation=270)

    # No ticks, fixed unit-square limits, and the caller-supplied title
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_title(distance_type)

    # Remove spines for a cleaner look
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    return fig
def load_json_from_path(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the resulting object."""
    with open(path, mode="r", encoding="utf8") as handle:
        return json.load(handle)
class Measurer:
    """Look up four kinds of pairwise language distance and plot them.

    Every table is a nested mapping ``table[lang_a][lang_b] -> distance``.
    A pair may be stored under only one of its two orientations, so lookups
    try ``(l1, l2)`` first and fall back to ``(l2, l1)``.
    """

    # Plot titles, in the same order as the tuple returned by get_dists().
    _PLOT_TITLES = (
        "Language Family Tree Distance",
        "Distance on the Globe",
        "Phoneme-Frequency Distance",
        "Machine-Learned Distance",
    )

    def __init__(self):
        """Load all distance tables from files in the working directory."""
        # learned dist
        self.learned_dist_func = load_json_from_path("lang_1_to_lang_2_to_l1_dist.json")

        # tree dist
        self.tree_dist_func = load_json_from_path("lang_1_to_lang_2_to_tree_dist.json")

        # map dist, rescaled in place so that the largest entry becomes 1.0
        self.map_dist_func = load_json_from_path("lang_1_to_lang_2_to_map_dist.json")
        largest_value_map_dist = max(
            (value for row in self.map_dist_func.values() for value in row.values()),
            default=0.0,
        )
        # Skip the rescaling when there is nothing positive to scale by;
        # dividing by zero here would abort start-up.
        if largest_value_map_dist > 0:
            for row in self.map_dist_func.values():
                for lang_2 in row:
                    row[lang_2] = row[lang_2] / largest_value_map_dist

        # ASP
        # NOTE: pickle.load executes arbitrary code from the file; this is
        # only acceptable because asp_dict.pkl ships with the application.
        with open("asp_dict.pkl", 'rb') as dictfile:
            asp_sim = pickle.load(dictfile)
        # asp_sim[lang] is indexed by the position of the other language in
        # the key order of asp_sim (presumably a similarity vector — confirm
        # against the code that produced the pickle).
        lang_list = list(asp_sim.keys())
        self.asp_dist_func = dict()
        for row_index, lang_1 in enumerate(lang_list):
            # It's symmetric, so only pairs with a later language are kept.
            self.asp_dist_func[lang_1] = {
                lang_2: 1 - asp_sim[lang_1][col_index]
                for col_index, lang_2 in enumerate(lang_list)
                if col_index > row_index
            }

    @staticmethod
    def _symmetric_lookup(table, l1, l2):
        """Return table[l1][l2] if present, otherwise table[l2][l1] (may raise KeyError)."""
        row = table.get(l1)
        if row is not None and l2 in row:
            return row[l2]
        return table[l2][l1]

    @staticmethod
    def _iso_code(label):
        """Extract the code from the trailing parentheses of a label like 'English (eng)'."""
        return label.split(" ")[-1].split("(")[1].split(")")[0]

    def get_dists(self, l1, l2):
        """Return the distances between two language codes.

        Args:
            l1: Code of the first language, as used for the table keys.
            l2: Code of the second language.

        Returns:
            A tuple ``(tree_dist, map_dist, asp_dist, learned_dist)``.

        Raises:
            KeyError: If the pair is missing in both orientations from the
                tree, map or learned table.
        """
        tree_dist = self._symmetric_lookup(self.tree_dist_func, l1, l2)
        map_dist = self._symmetric_lookup(self.map_dist_func, l1, l2)
        try:
            asp_dist = self._symmetric_lookup(self.asp_dist_func, l1, l2)
        except KeyError:
            asp_dist = tree_dist  # dirty hack, but like 4 codes are not part of phonepiece
        learned_dist = self._symmetric_lookup(self.learned_dist_func, l1, l2)
        return tree_dist, map_dist, asp_dist, learned_dist

    def measure(self, l1, l2):
        """Plot the four distances between two dropdown selections.

        Args:
            l1: First selection, a label whose last token is the language
                code in parentheses, e.g. ``"English (eng)"``.
            l2: Second selection in the same format.

        Returns:
            A tuple of four figures: tree, map, phoneme-frequency and
            machine-learned distance, in that order.
        """
        if l1 == l2:
            # Identical selections are at distance zero; no lookup is done.
            dists = (0.0, 0.0, 0.0, 0.0)
        else:
            dists = self.get_dists(self._iso_code(l1), self._iso_code(l2))
        return tuple(
            plot_scalar_on_scale(dist, f"{title} between {l1} and {l2}")
            for dist, title in zip(dists, self._PLOT_TITLES)
        )
# Load all distance tables once at start-up; every request reuses this instance.
m = Measurer()

# Dropdown entries have the form "<full name> (<iso code>)", e.g. "English (eng)".
# Measurer.measure reads the code back out of the trailing parentheses.
iso_to_name = load_json_from_path("iso_to_fullname.json")
text_selection = [f"{iso_to_name[iso_code]} ({iso_code})" for iso_code in iso_to_name]

# Two language dropdowns in, four plots out (one per distance measure).
iface = gr.Interface(fn=m.measure,
                     inputs=[gr.Dropdown(text_selection,
                                         type="value",
                                         value='English (eng)',
                                         label="Select the first Language (type on your keyboard to find it quickly)"),
                             gr.Dropdown(text_selection,
                                         type="value",
                                         value='German (deu)',
                                         label="Select the second Language (type on your keyboard to find it quickly)")],
                     outputs=[gr.Plot(label="", show_label=False, format="png", container=True),
                              gr.Plot(label="", show_label=False, format="png", container=True),
                              gr.Plot(label="", show_label=False, format="png", container=True),
                              gr.Plot(label="", show_label=False, format="png", container=True)],
                     description="<br><br> This demo allows you to view the distance between two languages from the ISO 639-3 list according to several distance measurement functions. "
                                 "For more information, check out our paper: https://arxiv.org/abs/2406.06403 and our text-to-speech tool, in which we make use of "
                                 "this technique: https://github.com/DigitalPhonetics/IMS-Toucan <br><br>")
iface.launch()