File size: 3,355 Bytes
9c144a7 65e0eb6 8f1f4af f45d27d 9c144a7 46fe6cd 65e0eb6 46fe6cd 65e0eb6 46fe6cd f45d27d 65e0eb6 46fe6cd 65e0eb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
---
license: apache-2.0
library_name: transformers
language:
- en
- pt
pipeline_tag: translation
---
# Transformer En-PT (Teeny-Tiny Castle)
This model translates English sentences into Portuguese. It is part of a tutorial in the [Teeny-Tiny Castle](https://github.com/Nkluge-correa/TeenyTinyCastle), an open-source repository containing educational tools for AI Ethics and Safety research.
## How to Use
```python
import re
import string

import keras
import numpy as np
import tensorflow as tf
from huggingface_hub import hf_hub_download
# Punctuation characters to strip from the text. "[" and "]" are kept so the
# "[start]" / "[end]" sequence markers survive standardization.
strip_chars = "".join(ch for ch in string.punctuation if ch not in "[]")
def custom_standardization(input_string):
    """Lowercase the text and delete every character listed in ``strip_chars``.

    Used as the ``standardize`` callable of the Portuguese
    ``TextVectorization`` layer.

    Args:
        input_string: String tensor (or value convertible to one) to clean.

    Returns:
        The lowercased input with the ``strip_chars`` characters removed.
    """
    lowercase = tf.strings.lower(input_string)
    # re.escape keeps regex metacharacters in strip_chars literal inside the
    # character class.
    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")
# Fetch the custom layer definitions from the Hub into the working directory
# so that the `keras_transformer_blocks` module can be imported afterwards.
transformer_blocks_path = hf_hub_download(
    repo_id="AiresPucrs/transformer-eng-por",
    filename="keras_transformer_blocks.py",
    repo_type='model',
    local_dir="./")
from keras_transformer_blocks import TransformerEncoder, PositionalEmbedding, TransformerDecoder
# The custom layer classes are passed by name via `custom_objects` when the
# saved model is loaded.
custom_objects = {
    "TransformerEncoder": TransformerEncoder,
    "PositionalEmbedding": PositionalEmbedding,
    "TransformerDecoder": TransformerDecoder,
}
# NOTE(review): this path assumes a local copy of the model repository under
# ./transformer-eng-por — confirm the .h5 file is present before running.
transformer = keras.models.load_model(
    "./transformer-eng-por/transformer-eng-por.h5",
    custom_objects=custom_objects,
)
# Read one vocabulary token per line. The `with` statement closes each file
# on exit, so no explicit close() call is needed.
with open('portuguese_vocabulary.txt', encoding='utf-8', errors='backslashreplace') as fp:
    portuguese_vocab = [line.strip() for line in fp]

with open('english_vocabulary.txt', encoding='utf-8', errors='backslashreplace') as fp:
    english_vocab = [line.strip() for line in fp]
vocab_size = 20000
sequence_length = 20

# The target layer produces one extra position because decode_sequence drops
# the last column of its output before calling the model.
target_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
    vocabulary=portuguese_vocab,
)

# The source layer is given no `standardize` argument, so it uses the layer's
# default standardization.
source_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    vocabulary=english_vocab,
)
# Map each token id (its position in the vocabulary list) back to its token.
portuguese_index_lookup = dict(enumerate(portuguese_vocab))
# Upper bound on the number of tokens generated per translation.
max_decoded_sentence_length = 20
def decode_sequence(input_sentence):
    """Translate an English sentence into Portuguese one token at a time.

    Starting from "[start]", each step re-vectorizes the sentence decoded so
    far, runs the model, and appends the highest-scoring token for the
    current position. Decoding stops when "[end]" is produced or after
    ``max_decoded_sentence_length`` steps.

    Args:
        input_sentence: English sentence to translate.

    Returns:
        The decoded string, beginning with "[start]" and, when the model
        emitted it, ending with "[end]".
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Drop the last column of the vectorized target before feeding the model.
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        # Pick the highest-scoring vocabulary entry at step i.
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = portuguese_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence
# Sample English inputs used to demonstrate the model.
eng_sentences = [
    "What is its name?",
    "How old are you?",
    "I know you know where Mary is.",
    "We will show Tom.",
    "What do you all do?",
    "Don't do it!",
]

# Print each sentence with its decoded translation, separated by a rule.
for sentence in eng_sentences:
    print(f"English sentence:\n{sentence}")
    print(f'Portuguese translation:\n{decode_sequence(sentence)}')
    print('-' * 50)
```
|