In [44]:
!pip install pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


https://huggingface.co/datasets/amazon_reviews_multi/viewer/all_languages/train

https://stackoverflow.com/questions/70814490/uploading-models-with-custom-forward-functions-to-the-huggingface-model-hub

https://huggingface.co/luisu0124/Amazon_review/tree/main

In [46]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
import tqdm

from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, BertConfig
from transformers import AdamW
from transformers import get_scheduler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# setting device to `cuda` if gpu exists
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# initialising the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
#tokenizer = AutoTokenizer.from_pretrained("pysentimiento/robertuito-sentiment-analysis")
#bert = AutoModel.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
bert = AutoModel.from_pretrained("google/bert_uncased_L-2_H-128_A-2")


Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Cargue de dataset

In [48]:
def tokenize_function(examples):
    '''Function for tokenizing raw texts'''
    return tokenizer(examples["review_body"], padding="max_length", truncation=True, max_length=128)
    #return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)


# downloading IMDB dataset from ðŸ¤— `datasets`
#raw_datasets = load_dataset("amazon_reviews_multi")
raw_datasets = load_dataset("amazon_reviews_multi","es")



Reusing dataset amazon_reviews_multi (/root/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


  0%|          | 0/3 [00:00<?, ?it/s]

In [49]:
# Running tokenizing function on the raw texts
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# for simplicity I have taken only the train split
tokenized_datasets = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))

Loading cached processed dataset at /root/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-46cf96799dcd2584.arrow


  0%|          | 0/5 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-69ce6d7f8f0abb0e.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-d0478a74f9a092bf.arrow


In [50]:

# Now lets create the torch Dataset class
class ClassificationDataset(Dataset):

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        d = self.dataset[idx]

        ids = torch.tensor(d['input_ids'])
        mask = torch.tensor(d['attention_mask'])
        label = torch.tensor(d['stars'])
        #label = torch.tensor(d['label'])
        return ids, mask, label


In [51]:

# Preparing the dataset and the Dataloader
dataset = ClassificationDataset(tokenized_datasets)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=8)


In [52]:

# Now lets create a custom Bert model
class CustomBert(transformers.PreTrainedModel):
    '''Custom model class
       ------------------
       Now the trick is not to inherit the class from `nn.Module` but `transformers.PretrainedModel`
       Also you need to pass the model config during initialisation'''

    def __init__(self, bert):
        super(CustomBert, self).__init__(config=BertConfig.from_pretrained('google/bert_uncased_L-2_H-128_A-2'))
        self.bert = bert

        self.l1 = nn.Linear(128, 1)

        self.do = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, sent_id, mask):
        '''For simplicity I have added only one linear layer, you can create any type of network you want'''
        
        bert_out = self.bert(sent_id, attention_mask=mask)
        o = bert_out.last_hidden_state[:,0,:]
        o = self.do(o)
        o = self.relu(o)
        o = self.l1(o)
        o = self.sigmoid(o)
        return o



In [53]:
# initialising model, loss and optimizer
model = CustomBert(bert)
model.to(device)
criterion = torch.nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)




In [54]:

# setting epochs, num_training_steps and the lr_scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [55]:

# training loop
model.train()
for epoch in tqdm.tqdm(range(num_epochs)):
    for batch in train_dataloader:
        ids, masks, labels = batch
        labels = labels.type(torch.float32)
        o = model(ids.to(device), masks.to(device))
        loss = criterion(torch.squeeze(o), labels.to(device))
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

# save the tokenizer and the model in `./test-model/` directory 
tokenizer.save_pretrained("/content/drive/MyDrive/Models/amazon_reviews")
model.save_pretrained("/content/drive/MyDrive/Models/amazon_reviews", push_to_hub=False)


  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 1/3 [00:17<00:34, 17.41s/it][A
 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 2/3 [00:34<00:17, 17.27s/it][A
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:51<00:00, 17.23s/it]


In [61]:
from transformers import pipeline
classifier = pipeline('text-classification', model='luisu0124/Amazon_review')

Some weights of the model checkpoint at luisu0124/Amazon_review were not used when initializing BertForSequenceClassification: ['l1.weight', 'l1.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at luisu0124/Amazon_review and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
classifier("Esta review es muy buena")

[{'label': 'POSITIVE', 'score': 0.5269547700881958}]

In [58]:
classifier("Este producto es bueno pero a su vez es malo")

[{'label': 'NEGATIVE', 'score': 0.5181595683097839}]

In [59]:
classifier("Excelente justo que buscaba")

[{'label': 'NEGATIVE', 'score': 0.5213820338249207}]

In [60]:
classifier("odio")

[{'label': 'NEGATIVE', 'score': 0.5219336152076721}]