import streamlit as st
from transformers import (
    AutoTokenizer,
    XLNetTokenizer
)
import pathlib
import json

st.set_page_config(layout='wide')
st.title("Transformers library For NLP Tasks : Structured by Topics") | |
st.write("lets start with the architectures of models") | |
neural_net_models = {
    'encoder': "responsible for understanding the input text.",
    'decoder': "designed to generate new text, e.g. to answer queries.",
    'encoder-decoder': "able to both understand and generate text, and can show emergent behaviour.",
    'convolution': "used for image recognition and processing.",
}
model_types = list(neural_net_models.keys())

archs = st.radio("model architectures".capitalize(), model_types)
st.write(f"{archs.capitalize()}: {neural_net_models[archs]}")
domains = {
    "computer_vision": {
        "encoder": ['vit', 'swin', 'segformer', 'beit'],
        "decoder": ['imagegpt'],
        "encoder-decoder": ['detr'],
        "convolution": ['convnext']
    },
    "nlp": {
        "encoder": ["bert", "roberta", "albert", "distilbert",
                    "deberta", "longformer"],
        "decoder": ["gpt-2", "xlnet", "gpt-j", "opt", "bloom"],
        "encoder-decoder": ["bart", "pegasus", "t5"],
    },
    "audio": {
        "encoder": ["wav2vec2", "hubert"],
        "encoder-decoder": ["speech2text", "whisper"]
    },
    "multimodal": {
        "encoder": ["visualbert", "vilt", "clip", "owl-vit"],
        "encoder-decoder": ["trocr", "donut"]
    },
    "reinforcement": {
        "decoder": ["trajectory transformer", "decision transformer"]
    }
}
st.write("Lets look at the Individual domains") | |
domain_list = list(domains.keys()) | |
doms = st.radio("domains of ai".capitalize(), domain_list) | |
st.write(domains[doms]) | |
st.write("Now comes the Tokenizers, the Entry Points") | |
tokenizer_algos = {
    "byte_pair": {
        "base": ['gpt', 'gpt-2(byte_level)'],
        "intro": "https://arxiv.org/abs/1508.07909"
    },
    "wordpiece": {
        "base": ['bert', 'distilbert', 'electra'],
        "intro": "https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf"
    },
    "unigram": {
        "base": ['not_used'],
        "intro": "https://arxiv.org/pdf/1804.10959.pdf"
    },
    "sentencepiece": {
        "base": ["xlm", "albert", "xlnet", "marian", "t5"],
        "intro": "https://arxiv.org/pdf/1808.06226.pdf"
    }
}
tokenizer_items = list(tokenizer_algos.keys())

algos = st.radio("tokenizer algos".capitalize(), tokenizer_items)
st.write(tokenizer_algos[algos])
st.write("""We will work on 3 types of tokenizers on a single sentence | |
to see how their output differs, by first encoding and decoding them too.""") | |
st.markdown("""### Models in Review: | |
- gpt2 | |
- bert-base-uncased | |
- xlm""") | |
input_sentence = "This is a sample sentence for testing tokenizers"

gpt2_model = "gpt2"
bert_model = "bert-base-uncased"
# note: the xlm_* variables below actually hold XLNet (xlnet-base-cased)
xlm_model = "xlnet-base-cased"

gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
xlm_tokenizer = XLNetTokenizer.from_pretrained(xlm_model)

st.markdown("#### The input sentence is")
st.write("The Sample Sentence: ", input_sentence)

gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)
bert_tokenize = bert_tokenizer.tokenize(input_sentence)
xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)
with st.expander(label="Byte Pair Tokenizer", expanded=False): | |
st.write("gpt2_tokenize = gpt2_tokenizer.tokenize(input_sentence)") | |
st.write(gpt2_tokenize) | |
with st.expander(label="Word Piece Tokenizer", expanded=False): | |
st.write("bert_tokenize = bert_tokenizer.tokenize(input_sentence)") | |
st.write(bert_tokenize) | |
with st.expander(label="SentencePiece Tokenizer", expanded=False): | |
st.write("xlm_tokenize = xlm_tokenizer.tokenize(input_sentence)") | |
st.write(xlm_tokenize) | |
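# The intro above also promises an encode/decode round trip, while the
# expanders only show tokenize(). A minimal sketch of that round trip,
# reusing the three tokenizer objects loaded above:
with st.expander(label="Encode / Decode Round Trip", expanded=False):
    for name, tok in [("gpt2", gpt2_tokenizer),
                      ("bert-base-uncased", bert_tokenizer),
                      ("xlnet-base-cased", xlm_tokenizer)]:
        ids = tok.encode(input_sentence)   # text -> token ids (with special tokens)
        text = tok.decode(ids)             # token ids -> text
        st.write(f"{name} ids: ", ids)
        st.write(f"{name} decoded: ", text)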
st.markdown("""#### Tokenizer Options: | |
There are following parameters in Tokenizer object are most used | |
- padding = 'longest'(True), 'max_length', 'do_not_pad'(False) | |
- truncation = 'longest_first'(True), 'only_second', 'only_first', | |
'do_not_truncate'(False) | |
- max_length = <= model_max_length """) | |
# Refer to https://huggingface.co/docs/transformers/pad_truncation
gpt2_max_length = gpt2_tokenizer.model_max_length
bert_max_length = bert_tokenizer.model_max_length
xlm_max_length = "Not specified"

st.markdown("""We also need the model max length, which is the
sequence length the model is configured for.""")
st.write("GPT-2: ", gpt2_max_length)
st.write("BERT: ", bert_max_length)
st.write("XLNet: ", xlm_max_length)
sent1 = "This app is talking about the variety of Tokenizers and their outputs"
sent2 = """Tokenizers do one thing: they turn text into numbers. The better the numbers,
the better the results"""

st.write("We will be working with the following sentences.")
st.write("Sentence1: ", sent1)
st.write("Sentence2: ", sent2)
st.markdown("#### Tokenization in Action. Using GPT Tokenizer") | |
st.markdown("""##### Trial-1: | |
> No parameter provided | |
> Sentences are given with comma seperation""") | |
gpt2_encode = gpt2_tokenizer(sent1, sent2) | |
st.write(gpt2_encode) | |
st.markdown("""##### Trial-2: | |
> No parameter provided | |
> Sentences are made into a List""") | |
gpt2_encode = gpt2_tokenizer([sent1, sent2]) | |
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2])") | |
st.write(gpt2_encode) | |
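# The difference between Trial-1 and Trial-2 is easy to miss in the raw ids:
# two positional strings are encoded as ONE sentence pair, while a list is
# encoded as TWO separate sequences. A small sketch to make that visible,
# reusing the tokenizer from above:
with st.expander(label="Pair vs. list of sentences", expanded=False):
    pair_ids = gpt2_tokenizer(sent1, sent2)["input_ids"]
    batch_ids = gpt2_tokenizer([sent1, sent2])["input_ids"]
    st.write("pair -> one id list of length ", len(pair_ids))
    st.write("list -> ", len(batch_ids), " id lists of lengths ",
             [len(ids) for ids in batch_ids])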
# gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  # GPT-2 has no pad token, so reuse eos
st.markdown("""##### Trial-3:
> A pad token has to be added to the tokenizer if the model does not define one
> padding = True
> Sentences are passed as a list""")
gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)
st.write("gpt2_encode = gpt2_tokenizer([sent1, sent2], padding=True)")
st.write(gpt2_encode)
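# A quick way to see the padding at work: decode each row of the batch above.
# The shorter sentence ends in repeated <|endoftext|> tokens, because eos is
# being reused as the pad token. A small illustrative sketch:
with st.expander(label="Decoded padded batch", expanded=False):
    st.write([gpt2_tokenizer.decode(ids) for ids in gpt2_encode["input_ids"]])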
st.markdown("""##### Trial-4: | |
> Need to add pad token to tokenizer, if the model doesn't have. | |
> padding = max_length (requires max_length = int) | |
> Sentences are made into a List""") | |
gpt2_encode = gpt2_tokenizer([sent1, sent2], | |
padding=True, | |
max_length=15) | |
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2], | |
padding=True, | |
max_length=15""") | |
st.write(gpt2_encode) | |
st.markdown("""##### Trial-5: | |
> truncate = True (requires max_length = int) | |
> Sentences are seperated by a comma | |
Will see total output of 12 token, 6 per sentence""") | |
gpt2_encode = gpt2_tokenizer(sent1, sent2, | |
truncation=True, | |
max_length=12) | |
st.write("""gpt2_encode = gpt2_tokenizer(sent1, sent2, | |
truncation=True, | |
max_length=12)""") | |
st.write(gpt2_encode) | |
st.markdown("""##### Trial-6: | |
> truncate = True (requires max_length = int) | |
> Sentences are made into a list | |
Will have longest first""") | |
gpt2_encode = gpt2_tokenizer([sent1, sent2], | |
truncation=True, | |
max_length=12) | |
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2], | |
truncation=True, | |
max_length=12)""") | |
st.write(gpt2_encode) | |
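# To confirm the claim above, the length of each truncated sequence can be
# displayed; both entries should be capped at max_length=12:
st.write("Lengths after truncation: ", [len(ids) for ids in gpt2_encode["input_ids"]])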
st.markdown("""##### Trial-7: | |
> truncate = only_first | |
> Sentences are made into a list | |
Will have only 8 tokens """) | |
gpt2_encode = gpt2_tokenizer([sent1, sent2], | |
truncation='only_first', | |
max_length=8) | |
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2], | |
truncation='only_first', | |
max_length=8)""") | |
st.write(gpt2_encode) | |
st.markdown("""##### Trial-8: | |
> truncate = False (only_second, is erroring out) | |
> Sentences are made into a list | |
No Truncation, 2 ids list""") | |
gpt2_encode = gpt2_tokenizer([sent1, sent2], | |
truncation=False, | |
max_length=7) | |
st.write("""gpt2_encode = gpt2_tokenizer([sent1, sent2], | |
truncation=False, | |
max_length=7)""") | |
st.write(gpt2_encode) | |
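# None of the trials above combines the options the way a typical training or
# inference call does. A hedged sketch of that combination, padding and
# truncating the batch in one call (this returns plain Python lists; pass
# return_tensors='pt' instead if PyTorch is installed):
combined = gpt2_tokenizer([sent1, sent2],
                          padding=True,
                          truncation=True,
                          max_length=12)
with st.expander(label="Padding + truncation combined", expanded=False):
    st.write("gpt2_tokenizer([sent1, sent2], padding=True, truncation=True, max_length=12)")
    st.write(combined)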
# Load the task -> architecture mapping stored next to this script
curr_dir = pathlib.Path(__file__).parent.resolve()
file_loc = curr_dir / "task_arch.json"
file_loc = file_loc.resolve()

with open(file_loc, 'r') as arch:
    data = json.load(arch)

tasks = list(data.keys())

st.markdown("#### Let's dive into the model architectures...")
task = st.radio("The NLP tasks", tasks)
task_data = data[task]
num_models = len(task_data['architectures'])

show_archs = st.slider("How many architectures to show",
                       min_value=4, max_value=num_models)
pruned_data = {
    "architectures": task_data['architectures'][:show_archs],
    "AutoModelClass": task_data["AutoModelClass"],
    "dataset": task_data["dataset"],
    "model_used": task_data["model_used"]
}
st.write(pruned_data)
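# A hedged sketch of turning the selected entry into a usable model. It
# assumes "AutoModelClass" stores the name of a transformers auto-class
# (e.g. "AutoModelForSequenceClassification") and "model_used" a Hub
# checkpoint id; both are assumptions about task_arch.json, and the heavy
# from_pretrained() download is left commented out on purpose:
import transformers

model_cls = getattr(transformers, str(task_data["AutoModelClass"]), None)
if model_cls is not None:
    st.write("Resolved AutoModel class: ", model_cls.__name__)
    # model = model_cls.from_pretrained(task_data["model_used"])
else:
    st.write("Could not resolve an AutoModel class from task_arch.json")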