File size: 1,659 Bytes
b78b52f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""
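Convert an alpaca-format dataset into sharegpt format.

With --data_type set to anything other than 'alpaca', the input is treated as
already sharegpt-like: an "items" column is renamed to "conversations" and all
other columns are dropped. The output is written as JSON Lines.

Usage: python convert_alpaca.py --in_file alpaca_data.json --out_file alpaca_data_sharegpt.json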
"""

import argparse

from datasets import load_dataset

if __name__ == "__main__":
    # Command-line interface: where to read from, where to write to, and how
    # the input records are laid out.
    parser = argparse.ArgumentParser(description="Convert a JSON dataset into sharegpt format.")
    # Both paths are mandatory: without them the loader would receive None and
    # the writer would produce a file literally named "None".
    parser.add_argument("--in_file", type=str, required=True, help="path to the input JSON file")
    parser.add_argument("--out_file", type=str, required=True, help="path of the JSON Lines file to write")
    parser.add_argument("--data_type", type=str, default='alpaca',
                        help="layout of the input records; anything other than 'alpaca' is "
                             "treated as already sharegpt-like")
    args = parser.parse_args()
    print(args)
    # The whole input file is loaded as a single "train" split.
    data_files = {"train": args.in_file}
    raw_datasets = load_dataset('json', data_files=data_files)
    ds = raw_datasets['train']


    def process_alpaca(examples):
        """Convert a batch of alpaca records into sharegpt-style conversations.

        Args:
            examples: Batch given as a mapping of column name to list of
                values; the 'instruction', 'input' and 'output' columns are read.

        Returns:
            A dict with a single 'conversations' column. Each entry is a
            two-turn list: the human prompt followed by the gpt answer.
        """
        convs = []
        for instruction, inp, output in zip(examples['instruction'], examples['input'], examples['output']):
            # Any non-blank input is context for the instruction, including a
            # single character such as "5". A missing (None) input is treated
            # the same as an empty one.
            if inp and inp.strip():
                instruction = instruction + '\n\n' + inp
            convs.append([
                {"from": "human", "value": instruction},
                {"from": "gpt", "value": output},
            ])
        return {"conversations": convs}


    # Pick the conversion by input layout, then write the result.
    if args.data_type == 'alpaca':
        # Replace every original column with the generated 'conversations' column.
        ds = ds.map(process_alpaca, batched=True, remove_columns=ds.column_names, desc="Running process")
    else:
        # Other sharegpt dataset, need rename to conversations and remove unused columns
        if "items" in ds.column_names:
            # Dataset exposes rename_column(old, new) for a single column;
            # it has no pandas-style rename(columns=...) method.
            ds = ds.rename_column("items", "conversations")
        if "conversations" not in ds.column_names:
            raise ValueError(
                f"Expected a 'conversations' (or 'items') column, found: {ds.column_names}"
            )
        columns_to_remove = [name for name in ds.column_names if name != "conversations"]
        ds = ds.remove_columns(columns_to_remove)

    # lines=True writes one JSON object per line; force_ascii=False is meant to
    # keep non-ASCII text unescaped in the output.
    ds.to_json(f"{args.out_file}", lines=True, force_ascii=False)