---
license: apache-2.0
library_name: transformers
language:
- en
- pt
pipeline_tag: translation
---

# Transformer En-PT (Teeny-Tiny Castle)

This model is part of a tutorial tied to the [Teeny-Tiny Castle](https://github.com/Nkluge-correa/TeenyTinyCastle), an open-source repository containing educational tools for AI Ethics and Safety research.

## How to Use

```python
import re
import string

import keras
import numpy as np
import tensorflow as tf
from huggingface_hub import hf_hub_download

# Punctuation stripped during standardization ("[" and "]" are kept
# so the [start] and [end] tokens survive)
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")


# Download the custom Transformer blocks, the trained model,
# and both vocabulary files from the model repository
hf_hub_download(repo_id="AiresPucrs/transformer-eng-por",
                filename="keras_transformer_blocks.py",
                repo_type="model",
                local_dir="./")

model_path = hf_hub_download(repo_id="AiresPucrs/transformer-eng-por",
                             filename="transformer-eng-por.h5",
                             repo_type="model",
                             local_dir="./")

portuguese_vocabulary_path = hf_hub_download(repo_id="AiresPucrs/transformer-eng-por",
                                             filename="portuguese_vocabulary.txt",
                                             repo_type="model",
                                             local_dir="./")

english_vocabulary_path = hf_hub_download(repo_id="AiresPucrs/transformer-eng-por",
                                          filename="english_vocabulary.txt",
                                          repo_type="model",
                                          local_dir="./")

from keras_transformer_blocks import TransformerEncoder, PositionalEmbedding, TransformerDecoder

transformer = keras.models.load_model(model_path,
                                      custom_objects={"TransformerEncoder": TransformerEncoder,
                                                      "PositionalEmbedding": PositionalEmbedding,
                                                      "TransformerDecoder": TransformerDecoder})

with open(portuguese_vocabulary_path, encoding="utf-8", errors="backslashreplace") as fp:
    portuguese_vocab = [line.strip() for line in fp]

with open(english_vocabulary_path, encoding="utf-8", errors="backslashreplace") as fp:
    english_vocab = [line.strip() for line in fp]

# Rebuild the target (Portuguese) and source (English) vectorization layers
# with the vocabularies loaded above
target_vectorization = tf.keras.layers.TextVectorization(max_tokens=20000,
                                                         output_mode="int",
                                                         output_sequence_length=21,
                                                         standardize=custom_standardization,
                                                         vocabulary=portuguese_vocab)

source_vectorization = tf.keras.layers.TextVectorization(max_tokens=20000,
                                                         output_mode="int",
                                                         output_sequence_length=20,
                                                         vocabulary=english_vocab)

portuguese_index_lookup = dict(zip(range(len(portuguese_vocab)), portuguese_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"

    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = portuguese_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence


eng_sentences = ["What is its name?",
                 "How old are you?",
                 "I know you know where Mary is.",
                 "We will show Tom.",
                 "What do you all do?",
                 "Don't do it!"]

for sentence in eng_sentences:
    print(f"English sentence:\n{sentence}")
    print(f"Portuguese translation:\n{decode_sequence(sentence)}")
    print("-" * 50)
```
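
The decoded string still carries the `[start]` and `[end]` markers used during training. If you want a clean translation, a small wrapper like the sketch below can strip them (the `translate` helper is an illustration added here, not part of the original tutorial):

```python
def translate(sentence: str) -> str:
    """Translate an English sentence and drop the [start]/[end] markers."""
    decoded = decode_sequence(sentence)
    return " ".join(token for token in decoded.split() if token not in ("[start]", "[end]"))


print(translate("How old are you?"))
```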