from datasets import Dataset, DatasetDict
import pandas as pd
from config import max_length, label2id
from model import tokenizer
import os
import torch
def convert_to_stsb_features(example_batch):
    # Tokenize the raw article text, padding/truncating to a fixed length
    # so every example in the batch has the same shape.
    inputs = example_batch['content']
    features = tokenizer.batch_encode_plus(
        inputs, truncation=True, max_length=max_length, padding='max_length')
    # features["labels"] = [label2id[i] for i in example_batch["sentiment"]]
    # No gold labels at inference time, so fill in dummy zeros.
    features["labels"] = [0] * len(example_batch["content"])  # [i for i in range(len(example_batch["content"]))]
    # features["nid"] = [int(i) for i in example_batch["nid"]]
    return features
def convert_to_features(dataset_dict, convert_func_dict):
    # Columns that will be exposed to the model as torch tensors, per task.
    columns_dict = {
        "document": ['input_ids', 'attention_mask', 'labels'],
        # "paragraph": ['input_ids', 'attention_mask', 'labels'],
        # "sentence": ['input_ids', 'attention_mask', 'labels'],
    }
    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        print(task_name)
        for phase, phase_dataset in dataset.items():
            # Tokenize each split with the task-specific conversion function.
            features_dict[task_name][phase] = phase_dataset.map(
                convert_func_dict[task_name],
                batched=True,
                load_from_cache_file=False,
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
            # Keep only the model inputs and return them as torch tensors.
            features_dict[task_name][phase].set_format(
                type="torch",
                columns=columns_dict[task_name],
            )
            print("=>", task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
    return features_dict
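

# --- Usage sketch (not part of the original file) ---
# A minimal example of how these helpers might be wired together, assuming a
# pandas DataFrame with a "content" column of raw text. The sample data and
# the "test" split name are illustrative assumptions; the "document" key
# matches columns_dict above.
if __name__ == "__main__":
    df = pd.DataFrame({"content": ["first news article", "second news article"]})
    dataset_dict = {
        "document": DatasetDict({"test": Dataset.from_pandas(df)})
    }
    convert_func_dict = {"document": convert_to_stsb_features}
    features_dict = convert_to_features(dataset_dict, convert_func_dict)
    # features_dict["document"]["test"] now yields torch tensors for
    # input_ids, attention_mask, and labels, ready for a DataLoader.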