---
license: apache-2.0
---

## How to use the discriminator in `transformers` on a custom dataset

(Heavily based on: https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)

```python
import math

import tensorflow as tf
from datasets import ClassLabel, Dataset, Features, Value
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    TFAutoModelForSequenceClassification,
    create_optimizer,
)

# This example shows how the model can be used:
# you should fine-tune it on your own corpus of IF commands, much bigger than this one.
dict_train = {
    "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"],
    "sentence": ["e", "get pen", "drop book", "x paper", "i", "south", "get paper", "drop pen",
                 "x book", "inventory", "n", "get book", "drop paper", "examine Pen", "inv", "w"],
    "label": ["v01835496", "v01214265", "v01977701", "v02131279", "v02472495", "v01835496",
              "v01214265", "v01977701", "v02131279", "v02472495", "v01835496", "v01214265",
              "v01977701", "v02131279", "v02472495", "v01835496"]
}

num_labels = len(set(dict_train["label"]))
features = Features({
    "idx": Value("uint32"),
    "sentence": Value("string"),
    # sort the label names so the class index assignment is deterministic
    "label": ClassLabel(names=sorted(set(dict_train["label"])))
})
raw_train_dataset = Dataset.from_dict(dict_train, features=features)

# `ignore_mismatched_sizes=True` re-initialises the classification head
# when `num_labels` differs from the one the pretrained model was trained with.
discriminator = TFAutoModelForSequenceClassification.from_pretrained(
    "Aureliano/distilbert-base-uncased-if",
    num_labels=num_labels,
    ignore_mismatched_sizes=True
)
tokenizer = AutoTokenizer.from_pretrained("Aureliano/distilbert-base-uncased-if")

tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)

pre_tokenizer_columns = set(raw_train_dataset.features)
train_dataset = raw_train_dataset.map(tokenize_function, batched=True)
tokenizer_columns = list(set(train_dataset.features) - pre_tokenizer_columns)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

batch_size = 16
tf_train_dataset = train_dataset.to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
num_epochs = 100
batches_per_epoch = math.ceil(len(train_dataset) / batch_size)
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(
    init_lr=1e-5, num_warmup_steps=1, num_train_steps=total_train_steps
)

discriminator.compile(optimizer=optimizer, loss=loss)
discriminator.fit(
    tf_train_dataset,
    epochs=num_epochs
)

text = "get lamp"
encoded_input = tokenizer(text, return_tensors="tf")
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
# The predicted class index refers to the ClassLabel feature,
# so map it back through `int2str` rather than indexing the raw training labels.
label = features["label"].int2str(int(tf.math.argmax(prediction)))
print(text, ":", label)
# ideally [v01214265 -> take.v.04 -> "get into one's hands, take physically"],
# but probably only with a better dataset
```
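
The labels in this toy dataset are WordNet verb synset identifiers (part of speech plus offset, e.g. `v01214265` for `take.v.04`). As a minimal sketch, assuming your labels follow the same convention and the NLTK WordNet corpus is available (`nltk.download("wordnet")`), a predicted label can be mapped back to a human-readable synset:

```python
from nltk.corpus import wordnet as wn

# Hypothetical helper: turn a label such as "v01214265" (WordNet POS + offset)
# back into its synset; adjust if your label scheme differs.
def label_to_synset(label):
    return wn.synset_from_pos_and_offset(label[0], int(label[1:]))

synset = label_to_synset("v01214265")
print(synset.name(), "->", synset.definition())
# expected: take.v.04 -> "get into one's hands, take physically"
```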