Spaces:
Runtime error
Runtime error
File size: 687 Bytes
abca9bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
def drop_duplicates_in_input(untokenized_dataset):
    """Collapse rows that share an ``id`` into one row with a list of outputs.

    Rows are scanned in order.  The first row seen for each ``id`` is kept;
    the ``output`` value of every later row with the same ``id`` is appended
    to that first row's list.  The scalar ``output`` column is then replaced
    by an ``outputs`` column holding those lists.

    Args:
        untokenized_dataset: Dataset-like object exposing ``"id"`` and
            ``"output"`` columns via ``[]`` as well as ``select``,
            ``flatten_indices``, ``remove_columns`` and ``add_column``.
            NOTE(review): presumably a Hugging Face ``datasets.Dataset`` --
            confirm against the caller.

    Returns:
        The dataset restricted to the first occurrence of each ``id``, with
        the ``output`` column removed and an ``outputs`` column added (one
        list per kept row, in first-seen order).
    """
    indices_to_keep = []  # row index of the first occurrence of each id
    id_to_idx = {}  # id -> position of its list inside ``outputs``
    outputs = []  # one list of outputs per kept row, aligned with indices_to_keep
    for i, (id_, output) in enumerate(zip(untokenized_dataset["id"], untokenized_dataset["output"])):
        if id_ in id_to_idx:
            # Duplicate id: fold its output into the first occurrence's list
            # and drop the row itself.
            outputs[id_to_idx[id_]].append(output)
            continue
        indices_to_keep.append(i)
        id_to_idx[id_] = len(outputs)
        outputs.append([output])
    # NOTE(review): flatten_indices() presumably materializes the selection so
    # the new column lines up with the physical rows -- confirm.
    untokenized_dataset = untokenized_dataset.select(indices_to_keep).flatten_indices()
    untokenized_dataset = untokenized_dataset.remove_columns("output")
    untokenized_dataset = untokenized_dataset.add_column("outputs", outputs)
    return untokenized_dataset
|