import gradio as gr
from time import time
import torch
import os
# import nltk
import argparse
import random
import numpy as np
import faiss
from argparse import Namespace
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from functools import partial
from sklearn.manifold import TSNE
from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelForSeq2SeqLM, MarianMTModel

dir_path = os.path.dirname(os.path.realpath(__file__))
print(dir_path)
metadata_all = {}

model_es = "Helsinki-NLP/opus-mt-en-es"
model_fr = "Helsinki-NLP/opus-mt-en-fr"
model_zh = "Helsinki-NLP/opus-mt-en-zh"
model_ar = "Helsinki-NLP/opus-mt-en-ar"

tokenizer_es = AutoTokenizer.from_pretrained(model_es)
tokenizer_fr = AutoTokenizer.from_pretrained(model_fr)
tokenizer_zh = AutoTokenizer.from_pretrained(model_zh)
tokenizer_ar = AutoTokenizer.from_pretrained(model_ar)

model_tr_es = MarianMTModel.from_pretrained(model_es)
model_tr_fr = MarianMTModel.from_pretrained(model_fr)
model_tr_zh = MarianMTModel.from_pretrained(model_zh)
model_tr_ar = MarianMTModel.from_pretrained(model_ar)

dict_models = {
    'en-es': model_es,
    'en-fr': model_fr,
    'en-zh': model_zh,
    'en-ar': model_ar,
}

dict_models_tr = {
    'en-es': model_tr_es,
    'en-fr': model_tr_fr,
    'en-zh': model_tr_zh,
    'en-ar': model_tr_ar,
}

dict_tokenizer_tr = {
    'en-es': tokenizer_es,
    'en-fr': tokenizer_fr,
    'en-zh': tokenizer_zh,
    'en-ar': tokenizer_ar,
}
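# The three dicts above are keyed by the language-pair id used throughout the UI
# ('en-es', 'en-fr', 'en-zh', 'en-ar'): dict_models holds checkpoint names,
# dict_models_tr the loaded MarianMTModel instances, and dict_tokenizer_tr the
# matching tokenizers. A minimal usage sketch (illustrative only, not executed here):
#
#   pair = 'en-es'
#   batch = dict_tokenizer_tr[pair]("Hello world", return_tensors="pt")
#   out = dict_models_tr[pair].generate(**batch)
#   print(dict_tokenizer_tr[pair].decode(out[0], skip_special_tokens=True))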
from faiss import write_index, read_index
import pickle
def translation_model(w1, model):
    inputs = dict_tokenizer_tr[model](w1, return_tensors="pt")
    input_embeddings = dict_models_tr[model].get_encoder().embed_tokens(inputs.input_ids)
    print(inputs)
    num_ret_seq = 1
    translated = dict_models_tr[model].generate(**inputs,
                                                num_beams=5,
                                                num_return_sequences=num_ret_seq,
                                                return_dict_in_generate=True,
                                                output_attentions=False,
                                                output_hidden_states=True,
                                                output_scores=True)
    tgt_text = dict_tokenizer_tr[model].decode(translated.sequences[0], skip_special_tokens=True)
    target_embeddings = dict_models_tr[model].get_decoder().embed_tokens(translated.sequences)
    return tgt_text, translated, inputs.input_ids, input_embeddings, target_embeddings
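# translation_model returns, in order: the decoded target string, the full
# generate() output (sequences, scores, hidden states), the source token ids,
# the encoder input-token embeddings and the decoder target-token embeddings.
# The embedding tensors come straight from the embed_tokens layers, i.e. they
# are the static token embeddings, not contextualised encoder/decoder states.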
def create_vocab_multiple(embeddings_list, model):
    """Build a token-level vocabulary from per-sentence token ids and embeddings.

    Args:
        embeddings_list (list): dicts with 'tokens' and 'embeddings' per sentence
        model (str): language-pair key used to pick the tokenizer

    Returns:
        dict: vocabulary of token embeddings
        list: per-sentence list of token ids
    """
    print("START VOCAB CREATION MULTIPLE \n \n ")
    vocab = {}  ## token id -> entry with text, count and embedding
    sentence_tokens_text_list = []
    for embeddings in embeddings_list:
        tokens_id = embeddings['tokens']  # [[token_ids] x n_sentences]
        for sent_i, sentence in enumerate(tokens_id):
            sentence_tokens = []
            for tok_i, token in enumerate(sentence):
                sentence_tokens.append(token)
                if not (token in vocab):
                    vocab[token] = {
                        'token': token,
                        'count': 1,
                        'text': dict_tokenizer_tr[model].decode([token]),
                        'embed': embeddings['embeddings'][sent_i][tok_i]}
                else:
                    vocab[token]['count'] = vocab[token]['count'] + 1
            sentence_tokens_text_list.append(sentence_tokens)
    print("END VOCAB CREATION MULTIPLE \n \n ")
    return vocab, sentence_tokens_text_list
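# Each vocab entry produced above looks roughly like (values illustrative):
#   vocab[215] = {'token': 215, 'count': 3, 'text': 'the', 'embed': tensor([...])}
# and sentence_tokens_text_list keeps the token ids in sentence order so the
# front-end can map points back to their position in the input.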
def vocab_words_all_prefix(token_embeddings, model, suffix="@@", prefix='▁'):
    """Group sub-word pieces into words using the SentencePiece word-start prefix."""
    vocab = {}
    sentence_words_text_list = []
    if prefix:
        n_prefix = len(prefix)
    for input_sentences in token_embeddings:
        for sent_i, sentence in enumerate(input_sentences['tokens']):
            words_text_list = []
            word = ''
            tokens_ids = []
            embeddings = []
            ids_to_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(sentence)
            to_save = False
            for tok_i, token_text in enumerate(ids_to_tokens):
                token_id = sentence[tok_i]
                if token_text[:n_prefix] == prefix:
                    # first save the previous word, if any
                    if to_save:
                        vocab[word] = {
                            'word': word,
                            'text': word,
                            'count': 1,
                            'tokens_ids': tokens_ids,
                            'embed': np.mean(np.array(embeddings), 0).tolist()
                        }
                        words_text_list.append(word)
                    # a new word starts at the prefix
                    tokens_ids = [token_id]
                    embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
                    word = token_text[n_prefix:]
                    to_save = True
                else:
                    if token_text in dict_tokenizer_tr[model].special_tokens_map.values():
                        if to_save:
                            vocab[word] = {
                                'word': word,
                                'text': word,
                                'count': 1,
                                'tokens_ids': tokens_ids,
                                'embed': np.mean(np.array(embeddings), 0).tolist()
                            }
                            words_text_list.append(word)
                        # a special token is a single-token word, no continuation
                        tokens_ids = [token_id]
                        embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
                        vocab[token_text] = {
                            'word': token_text,
                            'count': 1,
                            'text': token_text,
                            'tokens_ids': tokens_ids,
                            'embed': np.mean(np.array(embeddings), 0).tolist()
                        }
                        words_text_list.append(token_text)
                        to_save = False
                    else:
                        # a continuation piece; we do not know yet whether the word is complete
                        to_save = True
                        word += token_text
                        tokens_ids.append(token_id)
                        embeddings.append(input_sentences['embeddings'][sent_i][tok_i])
            if to_save:
                # flush the last pending word of the sentence
                if not (word in vocab):
                    vocab[word] = {
                        'word': word,
                        'count': 1,
                        'text': word,
                        'tokens_ids': tokens_ids,
                        'embed': np.mean(np.array(embeddings), 0).tolist()
                    }
                    words_text_list.append(word)
                else:
                    vocab[word]['count'] = vocab[word]['count'] + 1
            sentence_words_text_list.append(words_text_list)
    return vocab, sentence_words_text_list
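# Example of the word grouping done above (illustrative, Marian/SentencePiece
# style pieces): ['▁trans', 'lation', '▁works'] is merged into the words
# 'trans'+'lation' -> 'translation' and 'works'; each word entry stores the ids
# of its pieces and the mean of their piece embeddings as the word embedding.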
def create_index_voronoi(vocab):
    """Build a FAISS IVF (Voronoi-cell) index over token embeddings.

    Returns the index plus a metadata dict mapping faiss row position -> token info.
    """
    nb_embds = []   ## ordered embeddings list
    metadata = {}
    i_pos = 0
    for key_token, token_values in vocab.items():
        nb_embds.append(token_values['embed'])
        metadata[i_pos] = {'token': token_values['token'], 'text': token_values['text']}
        i_pos += 1
    xb = np.array(nb_embds).astype('float32')   # elements to index
    d = len(xb[0])   # dimension of each element
    nlist = 5        # number of Voronoi cells
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    index.train(xb)
    index.add(xb)
    return index, metadata
def create_index_voronoi_words(vocab):
    """Build a FAISS IVF (Voronoi-cell) index over word embeddings.

    Returns the index plus a metadata dict mapping faiss row position -> word info.
    """
    nb_embds = []   ## ordered embeddings list
    metadata = {}
    i_pos = 0
    for key_token, token_values in vocab.items():
        nb_embds.append(token_values['embed'])
        metadata[i_pos] = {'word': token_values['word'], 'tokens': token_values['tokens_ids'], 'text': token_values['text']}
        i_pos += 1
    xb = np.array(nb_embds).astype('float32')   # elements to index
    d = len(xb[0])   # dimension of each element
    nlist = 5        # number of Voronoi cells
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    index.train(xb)
    index.add(xb)
    return index, metadata
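# Note on the IVF parameters above: faiss generally wants on the order of tens of
# training vectors per Voronoi cell, so nlist=5 keeps training feasible even for a
# small reference vocabulary, at the cost of coarse cells. For comparison, an exact
# (non-IVF) search over the same float32 arrays xb/xq would look like this
# (illustrative sketch only):
#
#   flat = faiss.IndexFlatL2(xb.shape[1])
#   flat.add(xb)                # no training step needed
#   D, I = flat.search(xq, 10)  # exact L2 nearest neighbours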
def search_query_vocab(index, vocab_queries, topk=10, limited_search=[]):
    """Search a word-level FAISS index with a vocabulary of query words.

    Args:
        index: faiss index built over the reference word vocabulary
        vocab_queries (dict): word vocabulary; entries hold 'word', 'tokens_ids', 'text', 'embed'
        topk (int, optional): number of similar words to retrieve. Defaults to 10.

    Returns:
        Distance matrix D, indices matrix I and a metadata dict mapping query
        row position -> query word info.
    """
    nb_q_embds = []   ## ordered embeddings list
    metadata = {}
    qi_pos = 0
    for key, token_values in vocab_queries.items():
        metadata[qi_pos] = {'word': token_values['word'], 'tokens': token_values['tokens_ids'], 'text': token_values['text']}
        qi_pos += 1
        nb_q_embds.append(token_values['embed'])
    xq = np.array(nb_q_embds).astype('float32')   # elements to query
    D, I = index.search(xq, topk)
    return D, I, metadata
def search_query_vocab_token(index, vocab_queries, topk=10, limited_search=[]):
    """Search a token-level FAISS index with a vocabulary of query tokens.

    Returns:
        Distance matrix D, indices matrix I and a metadata dict mapping query
        row position -> query token info.
    """
    nb_q_embds = []   ## ordered embeddings list
    metadata = {}
    qi_pos = 0
    for key, token_values in vocab_queries.items():
        metadata[qi_pos] = {'token': token_values['token'], 'text': token_values['text']}
        qi_pos += 1
        nb_q_embds.append(token_values['embed'])
    xq = np.array(nb_q_embds).astype('float32')   # elements to query
    D, I = index.search(xq, topk)
    return D, I, metadata
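# index.search returns D and I with shape (n_queries, topk); row qi_pos of I
# holds the faiss positions of the nearest reference entries for query qi_pos,
# and -1 where fewer than topk neighbours were found (e.g. when too few cells
# are probed), which is why the callers below filter out i_index == -1.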
def build_search(query_embeddings, model, type="input"):
    global metadata_all
    ## build vocabularies for the queries
    vocab_queries, sentence_tokens_list = create_vocab_multiple(query_embeddings, model)
    words_vocab_queries, sentence_words_list = vocab_words_all_prefix(query_embeddings, model, suffix="@@", prefix="▁")

    ## token-level search against the reference index
    index_vor_tokens = metadata_all[type]['tokens'][1]
    md_tokens = metadata_all[type]['tokens'][2]
    D, I, meta = search_query_vocab_token(index_vor_tokens, vocab_queries)
    qi_pos = 0
    similar_tokens = {}
    for dist, ind in zip(D, I):
        try:
            similar_tokens[str(meta[qi_pos]['token'])] = {
                'token': meta[qi_pos]['token'],
                'text': meta[qi_pos]['text'],
                "similar_topk": [md_tokens[i_index]['token'] for i_index in ind if (i_index != -1)],
                "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
            }
        except Exception:
            print("\n ERROR ", qi_pos, dist, ind)
        qi_pos += 1

    ## word-level search against the reference index
    index_vor_words = metadata_all[type]['words'][1]
    md_words = metadata_all[type]['words'][2]
    Dw, Iw, metaw = search_query_vocab(index_vor_words, words_vocab_queries)
    qi_pos = 0
    similar_words = {}
    for dist, ind in zip(Dw, Iw):
        try:
            similar_words[str(metaw[qi_pos]['word'])] = {
                'word': metaw[qi_pos]['word'],
                'text': metaw[qi_pos]['word'],
                "similar_topk": [md_words[i_index]['word'] for i_index in ind if (i_index != -1)],
                "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
            }
        except Exception:
            print("\n ERROR ", qi_pos, dist, ind)
        qi_pos += 1

    return {'tokens': {'D': D, 'I': I, 'meta': meta, 'vocab_queries': vocab_queries, 'similar': similar_tokens, 'sentence_key_list': sentence_tokens_list},
            'words': {'D': Dw, 'I': Iw, 'meta': metaw, 'vocab_queries': words_vocab_queries, 'sentence_key_list': sentence_words_list, 'similar': similar_words}
            }
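# build_search therefore maps every query token/word to its top-k nearest
# reference tokens/words together with the L2 distances, keyed by the query id
# or word string; the JSON sent to the front-end is assembled from exactly this
# structure in first_function_tr below.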
def build_reference(all_embeddings, model):
    ## build vocabularies and FAISS indices for the reference sentences
    vocab, sentence_tokens = create_vocab_multiple(all_embeddings, model)
    words_vocab, sentences = vocab_words_all_prefix(all_embeddings, model, suffix="@@", prefix="▁")
    index_tokens, meta_tokens = create_index_voronoi(vocab)
    index_words, meta_words = create_index_voronoi_words(words_vocab)
    return {'tokens': [vocab, index_tokens, meta_tokens],
            'words': [words_vocab, index_words, meta_words]
            }
def embds_input_projection_vocab(vocab, key="token"):
    t0 = time()
    nb_ids = []     ## ordered ids list
    nb_embds = []   ## ordered embeddings list
    nb_text = []    ## ordered texts list
    tsne_error = []
    for _, token_values in vocab.items():
        tsne_error.append([0, 0])
        nb_ids.append(token_values[key])
        nb_text.append(token_values['text'])
        nb_embds.append(token_values['embed'])
    X = np.array(nb_embds).astype('float32')   # elements to project
    try:
        tsne = TSNE(random_state=0, n_iter=1000)
        tsne_results = tsne.fit_transform(X)
        # zip-like array: [[x, y, id_or_word, text, position], ...]
        tsne_results = np.c_[tsne_results, nb_ids, nb_text, range(len(nb_ids))]
    except Exception:
        # fall back to zero coordinates if t-SNE cannot run (e.g. too few points)
        tsne_results = np.c_[tsne_error, nb_ids, nb_text, range(len(nb_ids))]
    t1 = time()
    print("t-SNE: %.2g sec" % (t1 - t0))
    print(tsne_results)
    return tsne_results.tolist()
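# Each row returned above is a flat list like [x, y, id_or_word, text, position];
# because np.c_ mixes floats and strings, every field comes back as a string, so
# the consumers below index into the row positionally and cast with str() as needed.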
def filtered_projection(similar_key, vocab, type="input", key="word"):
    global metadata_all
    vocab_proj = vocab.copy()
    ## collect the reference words/tokens that appear in the similarity results
    source_words_voc_similar = set()
    for key_i in similar_key:
        words_set = similar_key[key_i]
        source_words_voc_similar.update(words_set['similar_topk'])
    print(len(source_words_voc_similar))
    source_embeddings_filtered = {key_value: metadata_all[type][key][0][key_value] for key_value in source_words_voc_similar}
    vocab_proj.update(source_embeddings_filtered)
    ## t-SNE projection of queries plus their similar reference entries
    try:
        result_TSNE = embds_input_projection_vocab(vocab_proj, key=key[:-1])   ## singular => without trailing 's'
        dict_projected_embds_all = {str(embds[2]): [embds[0], embds[1], embds[2], embds[3], embds[4]] for embds in result_TSNE}
    except Exception:
        print('TSNE error', type, key)
        dict_projected_embds_all = {}
    return dict_projected_embds_all
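# filtered_projection restricts the projection to the query vocabulary plus only
# those reference entries that showed up in some similar_topk list, which keeps
# the t-SNE input small; projecting the full reference vocabulary on every
# request would be considerably slower.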
def first_function(w1, model):
    """Translate the reference sentences and build the reference FAISS indices."""
    global metadata_all
    ## translate and collect internal values
    sentences = w1.split("\n")
    all_sentences = []
    translated_text = ''
    input_embeddings = []
    output_embeddings = []
    for sentence in sentences:
        params = translation_model(sentence, model)
        all_sentences.append(params)
        translated_text += params[0] + ' \n'
        input_embeddings.append({
            'embeddings': params[3].detach(),   ## used to create a vocabulary from the set of embeddings
            'tokens': params[2].tolist(),       # one translation = one sentence
        })
        output_embeddings.append({
            'embeddings': params[4].detach(),
            'tokens': params[1].sequences.tolist(),
        })
    ## build the FAISS reference indices
    # ---> could also preload faiss for each model with an initial dataset
    result_input = build_reference(input_embeddings, model)
    result_output = build_reference(output_embeddings, model)
    metadata_all = {'input': result_input, 'output': result_output}
    ### return the translation
    return [translated_text, params]
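# Note: first_function returns `params` (the raw output of the last call to
# translation_model, tensors included) as the second value; it is wired to the
# hidden gr.JSON component var2 below. The reference indices themselves are kept
# server-side in the global metadata_all and reused by first_function_tr.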
def first_function_tr(w1, model, var2={}):
    """Translate the analysis sentences and find similar reference tokens/words."""
    global metadata_all
    print("SEARCH -- ")
    sentences = w1.split("\n")
    all_sentences = []
    translated_text = ''
    input_embeddings = []
    output_embeddings = []
    for sentence in sentences:
        params = translation_model(sentence, model)
        all_sentences.append(params)
        translated_text += params[0] + ' \n'
        input_embeddings.append({
            'embeddings': params[3].detach(),   ## used to create a vocabulary from the set of embeddings
            'tokens': params[2].tolist(),       # one translation = one sentence
        })
        output_embeddings.append({
            'embeddings': params[4].detach(),
            'tokens': params[1].sequences.tolist(),
        })
    ## query the FAISS reference indices built by first_function
    result_search = {}
    result_search['input'] = build_search(input_embeddings, model, type='input')
    result_search['output'] = build_search(output_embeddings, model, type='output')
    ## result_search[type] = {'tokens': {'D', 'I', 'meta', 'vocab_queries', 'similar', 'sentence_key_list'},
    ##                        'words':  {'D', 'I', 'meta', 'vocab_queries', 'similar', 'sentence_key_list'}}

    json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
    dict_projected = {}
    for type in ['input', 'output']:
        dict_projected[type] = {}
        for key in ['tokens', 'words']:
            similar_key = result_search[type][key]['similar']
            vocab = result_search[type][key]['vocab_queries']
            dict_projected[type][key] = filtered_projection(similar_key, vocab, type=type, key=key)
            json_out[type][key]['similar_queries'] = similar_key
            json_out[type][key]['tnse'] = dict_projected[type][key]
            json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']
    return [translated_text, [json_out, json_out['output']['words'], json_out['output']['tokens']]]
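# Shape of the JSON handed to the front-end (plotsjs.js), per side and per level:
#   json_out[side][level] = {
#       'similar_queries': {query -> {'text', 'similar_topk', 'distance'}},
#       'tnse':            {id_or_word -> [x, y, id, text, position]},
#       'key_text_list':   per-sentence list of query keys in order,
#   }
# with side in {'input', 'output'} and level in {'tokens', 'words'}.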
from pathlib import Path

## First create the html and the divs that the d3 code will mount into
html = """
<html>
<script async src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script>
<script async data-require="d3@3.5.3" data-semver="3.5.3"
  src="//cdnjs.cloudflare.com/ajax/libs/d3/3.5.3/d3.js"></script>
<body>
    <div id="select_div">
        <select id="select_type" class="form-select" aria-label="select example" hidden>
            <option selected value="words">Words</option>
            <option value="tokens">Tokens</option>
        </select>
    </div>
    <div id="d3_embed_div">
        <div class="row">
            <div class="col-6">
                <div id="d3_embeds_input_words" class="d3_embed words"></div>
            </div>
            <div class="col-6">
                <div id="d3_embeds_output_words" class="d3_embed words"></div>
            </div>
            <div class="col-6">
                <div id="d3_embeds_input_tokens" class="d3_embed tokens"></div>
            </div>
            <div class="col-6">
                <div id="d3_embeds_output_tokens" class="d3_embed tokens"></div>
            </div>
        </div>
    </div>
    <div id="d3_graph_div">
        <div class="row">
            <div class="col-4">
                <div id="d3_graph_input_words" class="d3_graph words"></div>
            </div>
            <div class="col-4">
                <div id="similar_input_words" class=""></div>
            </div>
            <div class="col-4">
                <div id="d3_graph_output_words" class="d3_graph words"></div>
                <div id="similar_output_words" class="d3_graph words"></div>
            </div>
        </div>
        <div class="row">
            <div class="col-6">
                <div id="d3_graph_input_tokens" class="d3_graph tokens"></div>
                <div id="similar_input_tokens" class="d3_graph tokens"></div>
            </div>
            <div class="col-6">
                <div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
                <div id="similar_output_tokens" class="d3_graph tokens"></div>
            </div>
        </div>
    </div>
</body>
</html>
"""

html0 = """
<html>
<script async src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script>
<script async data-require="d3@3.5.3" data-semver="3.5.3"
  src="//cdnjs.cloudflare.com/ajax/libs/d3/3.5.3/d3.js"></script>
<body>
    <div id="select_div">
        <select id="select_type" class="form-select" aria-label="select example" hidden>
            <option selected value="words">Words</option>
            <option value="tokens">Tokens</option>
        </select>
    </div>
</body>
</html>
"""

html_col1 = """
<div id="d3_graph_input_words" class="d3_graph words"></div>
<div id="d3_graph_input_tokens" class="d3_graph tokens"></div>
"""

html_col2 = """
<div id="similar_input_words" class=""></div>
<div id="similar_output_words" class=""></div>
<div id="similar_input_tokens" class=""></div>
<div id="similar_output_tokens" class=""></div>
"""

html_col3 = """
<div id="d3_graph_output_words" class="d3_graph words"></div>
<div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
"""

# <div class="row">
#     <div class="col-6" id="d3_legend_data_source"> </div>
#     <div class="col-6" id="d3_legend_similar_source"> </div>
# </div>
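# The div ids above (d3_graph_input_words, similar_output_tokens, ...) are the
# mount points that plotsjs.js is expected to select by id when it renders the
# t-SNE scatter plots and the similarity lists; presumably html/html0 exist so
# the layout can switch between a single block and the three-column arrangement
# used in the Blocks below.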
def second_function(w1, j2):
    ## bridge function: the data transfer to json / the real work happens in the js callback
    print("second_function -- after the js", w1, j2)
    return "transition to second js function finished."
paths = []

def save_index(model):
    names = []
    with open(model + '_metadata_ref.pkl', 'wb') as f:
        pickle.dump(metadata_all, f)
    names.append(model + '_metadata_ref.pkl')
    for type in ['tokens', 'words']:
        for kind in ['input', 'output']:
            ## save each FAISS index to its own file
            name = model + "_" + kind + "_" + type + ".index"
            write_index(metadata_all[kind][type][1], name)
            names.append(name)
    print("in save index done")
    return gr.File(names)
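# The .index files written above can be reloaded with read_index(name). Note that
# metadata_all also holds the live faiss index objects (position [1] of each list);
# standard pickle may refuse to serialise those SWIG-wrapped objects, so the pickle
# file is best treated as covering the vocab/metadata parts only.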
with gr.Blocks(js="plotsjs.js") as demo:
    gr.Markdown(
        """
        # MAKE NMT Workshop \t `Embeddings representation`
        """)
    with gr.Row():
        with gr.Column(scale=1):
            model_radio_c = gr.Radio(choices=['en-es', 'en-zh', 'en-fr', 'en-ar'], value="en-es", label='', container=False)
        with gr.Column(scale=2):
            gr.Markdown(
                """
                ### Reference Translation Sentences
                Enter at least 50 sentences to be used as the comparison set.
                This is submitted just once.
                """)
            in_text = gr.Textbox(lines=2, label="reference source text")
            out_text = gr.Textbox(label="reference target text", interactive=False)
            out_text2 = gr.Textbox(visible=False)
            var2 = gr.JSON(visible=False)
            btn = gr.Button("Reference Translation")
            save_index_btn = gr.Button("Generate index files to download")
            tab2_outputs = gr.File()
            input = tab2_outputs
        with gr.Column(scale=3):
            gr.Markdown(
                """
                ### Translation Sentences
                Sentences to be analysed.
                """)
            in_text_tr = gr.Textbox(lines=2, label="source text")
            out_text_tr = gr.Textbox(label="target text", interactive=False)
            out_text2_tr = gr.Textbox(visible=False)
            var2_tr = gr.JSON(visible=False)
            btn_faiss = gr.Button("Translation")
            gr.Button("Download", link="/file=en-es_input_tokens.index")

    with gr.Row():
        with gr.Column(scale=1):
            input_mic = gr.HTML(html0)
            input_html2 = gr.HTML(html_col2)
        with gr.Column(scale=2):
            input_html1 = gr.HTML(html_col1)
        with gr.Column(scale=2):
            input_html3 = gr.HTML(html_col3)

    ## first function: inputs w1, model; returns out_text, var2; runs the python function, then the js
    btn.click(first_function, [in_text, model_radio_c], [out_text, var2],
              js="(in_text,model_radio_c) => testFn_out(in_text,model_radio_c)")
    btn_faiss.click(first_function_tr, [in_text_tr, model_radio_c], [out_text_tr, var2_tr],
                    js="(in_text_tr,model_radio_c) => testFn_out(in_text_tr,model_radio_c)")
    ## second function: inputs out_text and the json var2 returned by the first function;
    ## returns out_text2 and runs the js that consumes the json
    out_text.change(second_function, [out_text, var2], out_text2,
                    js="(out_text,var2) => testFn_out_json(var2)")
    out_text_tr.change(second_function, [out_text_tr, var2_tr], out_text2_tr,
                       js="(out_text_tr,var2_tr) => testFn_out_json_tr(var2_tr)")
    save_index_btn.click(save_index, [model_radio_c], [tab2_outputs])

if __name__ == "__main__":
    demo.launch(allowed_paths=["./", ".", "/"])