|
import torch |
|
import torch.onnx |
|
from transformer import Transformer |
|
import torch |
|
from huggingface_hub import hf_hub_download |
|
import torch |
|
import numpy as np |
|
import gradio as gr |
|
|
|
|
|
|
|
# Special tokens shared by both vocabularies.  Each is stored as ONE vocabulary
# entry even though its text is several characters long.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

# Character-level source vocabulary.  The position of every entry is its token
# id, so the order (and the set of entries) must not be changed.
# NOTE(review): there are no upper-case letters and no ';' here -- presumably
# this mirrors the vocabulary the checkpoint was trained with; confirm before
# editing.
english_vocabulary = [
    START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    ':', '<', '=', '>', '?', '@',
    '[', '\\', ']', '^', '_', '`',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
    'y', 'z',
    '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN,
]

# Target vocabulary: Gujarati digits, vowels, consonants, two conjuncts that
# are stored as single multi-code-point entries, and the dependent signs.
# As above, list position == token id.
gujarati_vocabulary = [
    START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '૦', '૧', '૨', '૩', '૪', '૫', '૬', '૭', '૮', '૯',
    ':', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
    'અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'એ', 'ઐ', 'ઓ', 'ઔ',
    'ક', 'ખ', 'ગ', 'ઘ', 'ઙ', 'ચ', 'છ', 'જ', 'ઝ', 'ઞ',
    'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ', 'ત', 'થ', 'દ', 'ધ', 'ન',
    'પ', 'ફ', 'બ', 'ભ', 'મ', 'ય', 'ર', 'લ', 'વ', 'શ',
    'ષ', 'સ', 'હ', 'ળ', 'ક્ષ', 'જ્ઞ', 'ં', 'ઃ', 'ઁ', 'ા',
    'િ', 'ી', 'ુ', 'ૂ', 'ે', 'ૈ', 'ો', 'ૌ', '્', 'ૐ',
    '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN,
]

# Lookup tables in both directions: token id -> token text, token text -> id.
index_to_gujarati = dict(enumerate(gujarati_vocabulary))
gujarati_to_index = {token: index for index, token in enumerate(gujarati_vocabulary)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {token: index for index, token in enumerate(english_vocabulary)}
|
|
|
# Model hyper-parameters, passed positionally to Transformer below.
# NOTE(review): presumably these must match the configuration the downloaded
# checkpoint was trained with, otherwise load_state_dict() will reject it.
d_model = 512
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 6
max_sequence_length = 200
# Size of the target (Gujarati) vocabulary; the "kn" prefix is kept because
# other code in this file refers to the name.
kn_vocab_size = len(gujarati_vocabulary)

# Inference runs on CPU only.
device = torch.device('cpu')

transformer = Transformer(
    d_model,
    ffn_hidden,
    num_heads,
    drop_prob,
    num_layers,
    max_sequence_length,
    kn_vocab_size,
    english_to_index,
    gujarati_to_index,
    START_TOKEN,
    END_TOKEN,
    PADDING_TOKEN,
)

# Fetch the checkpoint from the Hugging Face Hub (cached locally after the
# first download) and restore the trained weights.
model_file = hf_hub_download(
    repo_id="yashAI007/English_to_Gujarati_Translation",
    filename="model.pth",
)
# NOTE(security): torch.load() unpickles the file, and unpickling can execute
# arbitrary code.  Only load checkpoints from a repository you trust; consider
# passing weights_only=True if the installed torch version supports it.
model = torch.load(model_file, map_location='cpu')
transformer.load_state_dict(model['model_state_dict'])
transformer.to(device)
# Evaluation mode: the model is only used for inference in this script.
transformer.eval()
|
|
|
|
|
# Value written into the masks for positions that must not be attended to.
NEG_INFTY = -1e9


def create_masks(eng_batch, kn_batch, max_len=None):
    """Build the three attention masks for a batch of sentence pairs.

    For each sentence, every position whose index is strictly greater than
    the sentence length counts as padding (the slot at ``index == length`` is
    left unmasked -- presumably reserved for a start/end token; confirm
    against the tokenizer).  A mask entry is ``NEG_INFTY`` where attention is
    blocked and ``0`` elsewhere.

    Args:
        eng_batch: Sequence of source sentences (anything supporting ``len``).
        kn_batch: Sequence of target sentences, at least as long as
            ``eng_batch``; extra entries are ignored.
        max_len: Side length of each square mask.  Defaults to the
            module-level ``max_sequence_length``.

    Returns:
        A tuple ``(encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask)``, each of shape
        ``(len(eng_batch), max_len, max_len)``:

        * encoder self-attention: blocked where the row or the column is
          source padding;
        * decoder self-attention: blocked where the row or the column is
          target padding, or where the column lies after the row
          (look-ahead);
        * decoder cross-attention: blocked where the row is target padding
          or the column is source padding.

    Raises:
        IndexError: If ``kn_batch`` has fewer sentences than ``eng_batch``.
    """
    if max_len is None:
        max_len = max_sequence_length

    num_sentences = len(eng_batch)
    eng_lengths = torch.tensor([len(sentence) for sentence in eng_batch], dtype=torch.long)
    # Index kn_batch explicitly so a too-short target batch raises IndexError
    # instead of being silently broadcast.
    kn_lengths = torch.tensor(
        [len(kn_batch[idx]) for idx in range(num_sentences)], dtype=torch.long
    )

    positions = torch.arange(max_len)

    # [i, j] is True when column j lies strictly after row i.
    look_ahead_mask = positions.unsqueeze(0) > positions.unsqueeze(1)

    # (batch, max_len): True at every padded position of each sentence.
    eng_is_padding = positions.unsqueeze(0) > eng_lengths.unsqueeze(1)
    kn_is_padding = positions.unsqueeze(0) > kn_lengths.unsqueeze(1)

    # unsqueeze(1) varies along columns, unsqueeze(2) varies along rows.
    encoder_padding_mask = eng_is_padding.unsqueeze(1) | eng_is_padding.unsqueeze(2)
    decoder_padding_mask_self_attention = kn_is_padding.unsqueeze(1) | kn_is_padding.unsqueeze(2)
    decoder_padding_mask_cross_attention = eng_is_padding.unsqueeze(1) | kn_is_padding.unsqueeze(2)

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(
        look_ahead_mask | decoder_padding_mask_self_attention, NEG_INFTY, 0
    )
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask
|
|
|
# Make sure the model is in evaluation mode before any request is served.
transformer.eval()


def translate(eng_sentence):
    """Translate one English sentence into Gujarati by greedy decoding.

    The input is lower-cased, then the target sentence is grown one token at
    a time: at step ``i`` the model is run on the source and the partial
    target, and the highest-scoring token at position ``i`` is appended.
    Decoding stops when the model emits ``END_TOKEN`` or after
    ``max_sequence_length`` steps.

    Args:
        eng_sentence: The English text to translate.

    Returns:
        The Gujarati translation, without the end-of-sentence marker.
    """
    print("English Sentence:", eng_sentence)
    eng_sentence = (eng_sentence.lower(),)
    kn_sentence = ("",)

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for position in range(max_sequence_length):
            (
                encoder_self_attention_mask,
                decoder_self_attention_mask,
                decoder_cross_attention_mask,
            ) = create_masks(eng_sentence, kn_sentence)
            predictions = transformer(
                eng_sentence,
                kn_sentence,
                encoder_self_attention_mask.to(device),
                decoder_self_attention_mask.to(device),
                decoder_cross_attention_mask.to(device),
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=True,
                dec_end_token=False,
            )
            next_token_prob_distribution = predictions[0][position]
            next_token_index = torch.argmax(next_token_prob_distribution).item()
            next_token = index_to_gujarati[next_token_index]
            # Stop BEFORE appending the end marker.  Appending it and then
            # slicing off a fixed five characters would also cut five real
            # characters whenever the length limit is hit without an
            # END_TOKEN being produced.
            if next_token == END_TOKEN:
                break
            kn_sentence = (kn_sentence[0] + next_token,)

    translation = kn_sentence[0]
    print("Gujarati Sentence:", translation, '\n')
    return translation
|
|
|
# Sample inputs offered by the demo UI; each inner list holds the value for
# the single text input.
examples = [
    ["Hello, how are you?"],
    ["What is your name?"],
    ["I love programming."],
    ["This is a beautiful day."],
    ["Can you help me with this?"],
    ["What time is it?"],
    ["I am learning data science."],
    ["Where is the nearest bus stop?"],
    ["I enjoy reading books."],
    ["Thank you for your help."],
]

description = "This tool translates English sentences into Gujarati. Please enter your text above to get started!"

# Text-in / text-out web UI around translate().
iface = gr.Interface(
    fn=translate,
    inputs="text",
    outputs="text",
    title="English to Gujarati Translation",
    examples=examples,
    description=description,
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()