# NOTE: the three lines originally here were residue from a web file viewer,
# not program text: a "File size: 6,694 Bytes" banner, a column of blame
# commit hashes (4d0d76c, 609c11f, a4397b2, ab3b5e0) and a 1-151 line-number
# gutter. They are kept only as this comment so the module can be parsed.
import torch
import torch.onnx
from transformer import Transformer
import torch
from huggingface_hub import hf_hub_download
import torch
import numpy as np
import gradio as gr
# Character-level vocabularies (originally produced by filtering the
# "Appendix" code). A token's position in its list IS its integer id, so the
# order of every entry below must stay exactly as it is.
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

# Punctuation runs that appear, in the same order, in both vocabularies.
_PUNCTUATION_HEAD = (
    ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
)
_PUNCTUATION_MID = (
    ':', '<', '=', '>', '?', '@',
    '[', '\\', ']', '^', '_', '`',
)
_PUNCTUATION_TAIL = ('{', '|', '}', '~')

# NOTE(review): ';' is absent from both vocabularies -- confirm how the
# transformer handles characters that have no id.
english_vocabulary = [
    START_TOKEN,
    *_PUNCTUATION_HEAD,
    *'0123456789',
    *_PUNCTUATION_MID,
    *'abcdefghijklmnopqrstuvwxyz',
    *_PUNCTUATION_TAIL,
    PADDING_TOKEN,
    END_TOKEN,
]

gujarati_vocabulary = [
    START_TOKEN,
    *_PUNCTUATION_HEAD,
    # Gujarati digits.
    '૦', '૧', '૨', '૩', '૪', '૫', '૬', '૭', '૮', '૯',
    *_PUNCTUATION_MID,
    # Independent vowels.
    'અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'એ', 'ઐ', 'ઓ', 'ઔ',
    # Consonants.
    'ક', 'ખ', 'ગ', 'ઘ', 'ઙ', 'ચ', 'છ', 'જ', 'ઝ', 'ઞ',
    'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ', 'ત', 'થ', 'દ', 'ધ', 'ન',
    'પ', 'ફ', 'બ', 'ભ', 'મ', 'ય', 'ર', 'લ', 'વ', 'શ',
    # Remaining consonants, two conjuncts stored as single tokens, then signs.
    'ષ', 'સ', 'હ', 'ળ', 'ક્ષ', 'જ્ઞ', 'ં', 'ઃ', 'ઁ', 'ા',
    'િ', 'ી', 'ુ', 'ૂ', 'ે', 'ૈ', 'ો', 'ૌ', '્', 'ૐ',
    *_PUNCTUATION_TAIL,
    PADDING_TOKEN,
    END_TOKEN,
]

# Lookup tables in both directions: id -> token and token -> id.
index_to_gujarati = dict(enumerate(gujarati_vocabulary))
gujarati_to_index = {token: position for position, token in enumerate(gujarati_vocabulary)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {token: position for position, token in enumerate(english_vocabulary)}
# ---------------------------------------------------------------------------
# Model hyper-parameters. They are passed positionally to Transformer below;
# presumably they must match the values the checkpoint was trained with --
# verify against the training script before changing any of them.
# ---------------------------------------------------------------------------
d_model = 512
# batch_size = 64  (not used by this script)
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 6
max_sequence_length = 200
# Size of the Gujarati vocabulary (the "kn_" prefix is kept as-is because the
# name is module-level).
kn_vocab_size = len(gujarati_vocabulary)

# The app runs on CPU only. To use a GPU when one is present, swap in:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')

transformer = Transformer(
    d_model,
    ffn_hidden,
    num_heads,
    drop_prob,
    num_layers,
    max_sequence_length,
    kn_vocab_size,
    english_to_index,
    gujarati_to_index,
    START_TOKEN,
    END_TOKEN,
    PADDING_TOKEN,
)

# Fetch the checkpoint file from the Hugging Face Hub.
model_file = hf_hub_download(
    repo_id="yashAI007/English_to_Gujarati_Translation",
    filename="model.pth",
)

# NOTE(review): torch.load unpickles the downloaded file, so the repository
# above is trusted implicitly. Consider weights_only=True if the installed
# torch version supports it and the checkpoint loads that way.
# `model` holds the whole checkpoint dict; only its 'model_state_dict' entry
# is used here.
model = torch.load(model_file, map_location='cpu')
transformer.load_state_dict(model['model_state_dict'])

# Place the network on the chosen device and switch it to evaluation mode.
transformer.to(device)
transformer.eval()
# Additive value written at every blocked attention position.
NEG_INFTY = -1e9


def create_masks(eng_batch, kn_batch, max_length=None):
    """Build the three additive attention masks for a batch of sentence pairs.

    For every sentence, positions from ``len(sentence) + 1`` onward are
    treated as padding, so exactly one position after the raw text is left
    visible (NOTE(review): presumably room for a start/end marker -- confirm
    against the transformer's tokenizer).

    Args:
        eng_batch: Sequence of source sentences; only ``len()`` of each item
            is used.
        kn_batch: Sequence of target sentences, index-aligned with
            ``eng_batch``; only ``len()`` of each item is used.
        max_length: Side length of the square masks. ``None`` (the default)
            uses the module-level ``max_sequence_length``, which is the
            behaviour existing callers get.

    Returns:
        A tuple ``(encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask)``. Each tensor has shape
        ``(len(eng_batch), max_length, max_length)`` and contains ``0`` where
        attention is allowed and ``NEG_INFTY`` where it is blocked.

    Raises:
        IndexError: If ``kn_batch`` has fewer items than ``eng_batch``.
    """
    if max_length is None:
        max_length = max_sequence_length

    num_sentences = len(eng_batch)
    mask_shape = [num_sentences, max_length, max_length]

    # Strictly upper-triangular: a target position may not look at later ones.
    look_ahead_mask = torch.triu(
        torch.full([max_length, max_length], True), diagonal=1
    )
    encoder_padding_mask = torch.full(mask_shape, False)
    decoder_padding_mask_self_attention = torch.full(mask_shape, False)
    decoder_padding_mask_cross_attention = torch.full(mask_shape, False)

    for idx, eng_sentence in enumerate(eng_batch):
        eng_pad_start = len(eng_sentence) + 1
        kn_pad_start = len(kn_batch[idx]) + 1

        # Plain slices select the same trailing positions the index arrays
        # did, and are simply empty when the sentence fills the whole window.
        # Encoder self-attention: padded source columns and rows are blocked.
        encoder_padding_mask[idx, :, eng_pad_start:] = True
        encoder_padding_mask[idx, eng_pad_start:, :] = True
        # Decoder self-attention: padded target columns and rows are blocked.
        decoder_padding_mask_self_attention[idx, :, kn_pad_start:] = True
        decoder_padding_mask_self_attention[idx, kn_pad_start:, :] = True
        # Cross-attention: columns follow the source padding, rows the target.
        decoder_padding_mask_cross_attention[idx, :, eng_pad_start:] = True
        decoder_padding_mask_cross_attention[idx, kn_pad_start:, :] = True

    # Convert the boolean "blocked" flags into additive masks.
    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    # The 2-D look-ahead mask broadcasts over the batch dimension.
    decoder_self_attention_mask = torch.where(
        look_ahead_mask | decoder_padding_mask_self_attention, NEG_INFTY, 0
    )
    decoder_cross_attention_mask = torch.where(
        decoder_padding_mask_cross_attention, NEG_INFTY, 0
    )
    return (
        encoder_self_attention_mask,
        decoder_self_attention_mask,
        decoder_cross_attention_mask,
    )
# Evaluation mode is (re)asserted here before the inference entry point.
transformer.eval()


def translate(eng_sentence):
    """Greedily translate one English sentence into Gujarati.

    The sentence is lower-cased, then the model is run repeatedly: on step
    ``i`` the highest-scoring token at output position ``i`` is appended to
    the partial translation, which is fed back in on the next step. Decoding
    stops when the end marker is predicted or after ``max_sequence_length``
    steps.

    Args:
        eng_sentence: The English text to translate (a ``str``).

    Returns:
        The Gujarati translation as a ``str``, without the end marker.
    """
    print("English Sentence:", eng_sentence)
    eng_sentence = (eng_sentence.lower(),)
    kn_sentence = ("",)

    # Inference only: skip autograd bookkeeping for the repeated forward passes.
    with torch.no_grad():
        for word_counter in range(max_sequence_length):
            # Masks depend on the current partial translation, so rebuild them.
            (
                encoder_self_attention_mask,
                decoder_self_attention_mask,
                decoder_cross_attention_mask,
            ) = create_masks(eng_sentence, kn_sentence)
            predictions = transformer(
                eng_sentence,
                kn_sentence,
                encoder_self_attention_mask.to(device),
                decoder_self_attention_mask.to(device),
                decoder_cross_attention_mask.to(device),
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=True,
                dec_end_token=False,
            )
            # First (only) sentence in the batch, scores at the current step.
            next_token_prob_distribution = predictions[0][word_counter]
            next_token_index = torch.argmax(next_token_prob_distribution).item()
            next_token = index_to_gujarati[next_token_index]
            # Stop BEFORE appending the end marker. Slicing a fixed five
            # characters off afterwards would eat real output whenever the
            # step limit is reached without an end marker.
            if next_token == END_TOKEN:
                break
            # NOTE(review): tokens are appended as text, so a multi-character
            # vocabulary entry grows the string by more than one character
            # while word_counter advances by one -- confirm the transformer's
            # tokenizer keeps positions aligned in that case.
            kn_sentence = (kn_sentence[0] + next_token,)

    translation = kn_sentence[0]
    print("Gujarati Sentence:", translation, '\n')
    return translation
# Sample inputs passed to the interface; each inner list is one example row
# for the single text input.
examples = [
    ["Hello, how are you?"],
    ["What is your name?"],
    ["I love programming."],
    ["This is a beautiful day."],
    ["Can you help me with this?"],
    ["What time is it?"],
    ["I am learning data science."],
    ["Where is the nearest bus stop?"],
    ["I enjoy reading books."],
    ["Thank you for your help."],
]

description = "This tool translates English sentences into Gujarati. Please enter your text above to get started!"

# Text-in / text-out web UI that calls translate() on the submitted sentence.
iface = gr.Interface(
    fn=translate,
    inputs="text",
    outputs="text",
    title="English to Gujarati Translation",
    examples=examples,
    description=description,
)

# Start the web server only when run as a script, not on import.
# (A stray "|" that followed launch() was viewer residue and a syntax error.)
if __name__ == "__main__":
    iface.launch()