Spaces:

Kumarkishalaya
/

neural_machine_translation

Runtime error

App Files Files Community

neural_machine_translation / app.py

Kumarkishalaya

Update app.py

b01624d verified 28 days ago

raw

history blame contribute delete

5.44 kB

	import gradio as gr
	from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, MarianMTModel
	from tensorflow.keras.preprocessing.sequence import pad_sequences
	from tensorflow.keras.models import load_model
	import pickle
	import json
	import keras
	from huggingface_hub import hf_hub_download
	from transformers import pipeline
	import torch
	import os
	import numpy as np


	# ---------------------------------------------------------------------------
	# Model and tokenizer loading (runs once at import time).
	# ---------------------------------------------------------------------------

	# Stock Helsinki-NLP Marian English->Hindi model (PyTorch weights).
	model_name = "Helsinki-NLP/opus-mt-en-hi"
	model_base_nmt = MarianMTModel.from_pretrained(model_name)
	tokenizer_base_nmt = AutoTokenizer.from_pretrained(model_name)

	# Fine-tuned model: TensorFlow weights from the fine-tuned repository,
	# tokenizer taken from the Helsinki-NLP checkpoint.
	model_checkpoint = "himanishprak23/neural_machine_translation"
	tokenizer_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
	tokenizer_nmt = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
	model_nmt = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

	# LSTM seq2seq model, read from a local file next to this script.
	# An earlier hub-download path (repo "Kumarkishalaya/lstm-eng-to-hin",
	# file "seq2seq_model.keras", force_download=True) is currently disabled.
	model_lstm = load_model('seq2seq_model.h5')


	def _load_pickle(path):
	    """Unpickle and return the object stored at *path*."""
	    # NOTE: pickle can execute arbitrary code while loading; only ever
	    # point this at files that ship with the app.
	    with open(path, 'rb') as handle:
	        return pickle.load(handle)


	eng_tokenizer = _load_pickle('eng_tokenizer.pkl')
	hin_tokenizer = _load_pickle('hin_tokenizer.pkl')

	# Encoder input is padded/truncated to max_len_eng tokens; LSTM decoding
	# runs for at most max_len_hin steps.
	max_len_eng = 20
	max_len_hin = 22

	def translate_text_base_nmt(input_text):
	    """Translate English text to Hindi with the stock Helsinki-NLP model.

	    Args:
	        input_text: English text to translate.

	    Returns:
	        The decoded translation with special tokens removed, or an empty
	        string when ``input_text`` is empty, ``None`` or whitespace only.
	    """
	    # Nothing to translate: skip the model call instead of decoding
	    # whatever the model generates for an empty prompt.
	    if not input_text or not input_text.strip():
	        return ""

	    # truncation=True clips over-long input to the tokenizer's maximum
	    # length, matching the truncation used for the fine-tuned model.
	    batch = tokenizer_base_nmt([input_text], return_tensors="pt", truncation=True)
	    generated_ids = model_base_nmt.generate(**batch)
	    return tokenizer_base_nmt.batch_decode(generated_ids, skip_special_tokens=True)[0]

	def translate_text_nmt(input_text):
	    """Translate English text to Hindi with the fine-tuned model.

	    Args:
	        input_text: English text to translate.

	    Returns:
	        The decoded translation with special tokens removed, or an empty
	        string when ``input_text`` is empty, ``None`` or whitespace only.
	    """
	    # Nothing to translate: skip tokenization and generation entirely.
	    if not input_text or not input_text.strip():
	        return ""

	    # Input is truncated to 128 tokens and generation is capped at 128 tokens.
	    tokenized_input = tokenizer_nmt(
	        input_text, return_tensors='tf', max_length=128, truncation=True
	    )
	    generated_tokens = model_nmt.generate(**tokenized_input, max_length=128)
	    return tokenizer_nmt.decode(generated_tokens[0], skip_special_tokens=True)

	def translate_text_lstm(sentence, model, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin):
	    """Greedily decode a Hindi translation of *sentence* with the LSTM model.

	    Args:
	        sentence: English text to translate.
	        model: Seq2seq model; ``model.predict([encoder_input, decoder_input])``
	            is read as ``output[0, -1, :]``, the scores for the next token.
	        eng_tokenizer: Tokenizer exposing ``texts_to_sequences`` for English.
	        hin_tokenizer: Tokenizer exposing ``word_index`` (word -> id) for
	            Hindi; it must contain the ``'start'`` marker.
	        max_len_eng: Length the encoder input is padded/truncated to.
	        max_len_hin: Maximum number of decoding steps.

	    Returns:
	        The predicted Hindi words joined by single spaces; an empty string
	        when *sentence* is empty, ``None`` or whitespace only.

	    Raises:
	        KeyError: If ``'start'`` is missing from ``hin_tokenizer.word_index``.
	    """
	    # Nothing to translate: avoid running the model on an all-padding input.
	    if not sentence or not sentence.strip():
	        return ''

	    # Tokenize and pad the input sentence.
	    encoder_input = pad_sequences(
	        eng_tokenizer.texts_to_sequences([sentence]),
	        maxlen=max_len_eng,
	        padding='post',
	    )

	    # id -> word lookup for turning predictions back into text.
	    reverse_word_index = {idx: word for word, idx in hin_tokenizer.word_index.items()}

	    # Running decoder prefix: [start, w1, ..., wk]. The previous code rebuilt
	    # this as [w1, ..., wk, wk], which dropped the start marker and repeated
	    # the last token. Presumably the model was trained on start-prefixed
	    # targets (decoding is seeded with 'start') -- TODO confirm.
	    decoder_ids = [hin_tokenizer.word_index['start']]
	    decoded_sentence = []

	    for _ in range(max_len_hin):
	        decoder_input = np.array([decoder_ids], dtype=np.float64)
	        output = model.predict([encoder_input, decoder_input], verbose=0)
	        sampled_token_index = int(np.argmax(output[0, -1, :]))
	        # Ids absent from the vocabulary map to '' and end decoding.
	        sampled_word = reverse_word_index.get(sampled_token_index, '')

	        if sampled_word in ('end', '') or len(decoded_sentence) >= max_len_hin - 1:
	            break

	        decoded_sentence.append(sampled_word)
	        decoder_ids.append(sampled_token_index)

	    return ' '.join(decoded_sentence)


	def translate_text(input_text):
	    """Translate *input_text* with all three models.

	    Returns:
	        A 3-tuple ``(lstm, base_helsinki, finetuned_helsinki)`` of translated
	        strings, evaluated in that order.
	    """
	    return (
	        translate_text_lstm(
	            input_text, model_lstm, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin
	        ),
	        translate_text_base_nmt(input_text),
	        translate_text_nmt(input_text),
	    )
	# Sample English inputs offered in the UI. Gradio expects one row per
	# example with one entry per input component, hence the single-item lists.
	_example_sentences = (
	    "Microservices Architecture: Containers are ideal for deploying microservices, as each service can run in its own container and be independently managed.",
	    "Kubernetes: An open-source container orchestration platform that automates the deployment, scaling, and management of containerized applications.",
	    "Machine Learning: The practice of using algorithms to parse data, learn from it, and then make a determination or prediction about something in the world.",
	    "Data Science: An interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from noisy, structured, and unstructured data.",
	    "DevOps: A set of practices that combines software development and IT operations to shorten the development lifecycle and deliver high-quality software continuously.",
	)
	examples = [[sentence] for sentence in _example_sentences]

	# Single English input box.
	_input_box = gr.components.Textbox(
	    lines=2,
	    placeholder="Enter text to translate from English to Hindi...",
	)

	# One output box per model, in the order translate_text returns its results.
	_output_boxes = [
	    gr.components.Textbox(label="Translation (LSTM Model)"),
	    gr.components.Textbox(label="Translation (Base Helsinki Model)"),
	    gr.components.Textbox(label="Translation (Fine-tuned Helsinki Model)"),
	]

	# NOTE(review): the description below says "LSTM models both" and mentions
	# "GPT-2", but this file loads one LSTM model and two Helsinki-NLP models.
	# Confirm the intended wording before changing the user-facing text.
	iface = gr.Interface(
	    fn=translate_text,
	    inputs=_input_box,
	    outputs=_output_boxes,
	    title="English to Hindi Translator",
	    description="Enter English text and get the Hindi translation from three different models: LSTM, Base Helsinki-NLP, and Fine-tuned Helsinki-NLP. LSTM models both have a single hidden layer trained for 50 epochs, the GPT-2 has been trained for 5 epochs",
	    examples=examples,
	)

	# Start serving the Gradio app.
	iface.launch()