Muennighoff committed on
Commit
9a2e74d
·
1 Parent(s): 7b630b2
Files changed (3) hide show
  1. launch.sh +53 -0
  2. sbatch_mtf_ru.sh +147 -0
  3. train_ru.txt +1 -0
launch.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash

# Launch script using torch.distributed.run(). Used by slurm
# scripts, don't invoke directly.
#
# Arguments:
#   "$@" - forwarded verbatim to torch.distributed.run (the training
#          script followed by its own arguments).
# Environment (read):
#   SLURM_LOCALID, SLURM_PROCID, SLURM_JOB_NODELIST, SLURM_JOB_NUM_NODES,
#   SLURM_GPUS_ON_NODE, SLURM_CPUS_PER_TASK, SLURMD_NODENAME
# Exit status:
#   1 if a required Slurm variable is missing or the master node cannot
#   be resolved; otherwise the status of torch.distributed.run.

# Fail fast when started outside of srun: every variable below is used
# either to gate the /dev/shm cleanup or to build the launcher arguments.
for required_var in SLURM_LOCALID SLURM_PROCID SLURM_JOB_NODELIST \
    SLURM_JOB_NUM_NODES SLURM_GPUS_ON_NODE; do
  if [[ -z "${!required_var:-}" ]]; then
    printf 'launch.sh: %s is not set; start this script through srun\n' \
      "$required_var" >&2
    exit 1
  fi
done

# Samuel's fix for apparent error in SLURM initialization:
# local rank 0 clears /dev/shm and prints the GPU status, every other
# local rank waits two seconds.
# `[` (not `[[`) on purpose: a non-numeric value is an error and takes
# the harmless else branch instead of being evaluated arithmetically.
if [ "$SLURM_LOCALID" -eq 0 ]; then
  rm -rf /dev/shm/*
  rocm-smi || true  # informational only, must never abort the launch
else
  sleep 2
fi

export NCCL_SOCKET_IFNAME=hsn0,hsn1,hsn2,hsn3
export OMP_NUM_THREADS="$SLURM_CPUS_PER_TASK"
export FI_CXI_DEFAULT_CQ_SIZE=131072

# debugging (noisy)
#export NCCL_DEBUG=INFO
#export RCCL_KERNEL_COLL_TRACE_ENABLE=1
#export NCCL_DEBUG_SUBSYS=INIT,COLL

module --quiet purge
module load cray-python

module load CrayEnv
module load PrgEnv-cray/8.3.3
module load craype-accel-amd-gfx90a
# NOTE(review): cray-python is loaded a second time here, exactly as in
# the original; presumably to re-establish it after the PrgEnv switch.
module load cray-python

module use /pfs/lustrep2/projappl/project_462000125/samantao-public/mymodules
module load suse-repo-deps/sam-default
module load rocm/sam-5.2.3.lua
module load rccl/sam-develop.lua
module load aws-ofi-rccl/sam-default.lua

source /scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/bin/activate

# The first host of the allocation acts as the rendezvous master.
MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=9999

if [[ -z "$MASTER_NODE" ]]; then
  printf 'launch.sh: could not resolve a master node from %s\n' \
    "$SLURM_JOB_NODELIST" >&2
  exit 1
fi

echo "Launching on $SLURMD_NODENAME ($SLURM_PROCID/$SLURM_JOB_NUM_NODES)," \
  "master $MASTER_NODE port $MASTER_PORT," \
  "GPUs $SLURM_GPUS_ON_NODE," \
  "CUDA: $(python -c 'import torch; print(torch.cuda.is_available())')"

# One launcher per node; it spawns one worker per GPU on that node.
python -u -m torch.distributed.run \
  --nnodes "$SLURM_JOB_NUM_NODES" \
  --nproc_per_node "$SLURM_GPUS_ON_NODE" \
  --node_rank="$SLURM_PROCID" \
  --master_addr "$MASTER_NODE" \
  --master_port "$MASTER_PORT" \
  "$@"
sbatch_mtf_ru.sh ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=256G
#SBATCH -p eap
#SBATCH -t 48:00:00
#SBATCH --gpus-per-node=mi250:8
#SBATCH --exclusive=user
#SBATCH --hint=nomultithread
#SBATCH --account=project_462000119
#SBATCH -o logs/%j.out
#SBATCH -e logs/%j.err

# Slurm batch job: writes a per-job DeepSpeed config and starts
# Megatron-DeepSpeed/finetune_t0.py on every node through launch.sh.
#
# Environment (read): SLURM_JOB_ID, SLURM_GPUS_ON_NODE, SLURM_JOB_NUM_NODES
# Files written:      logs/latest.{out,err} (symlinks),
#                     ds_configs/<job id>.json
# The logs/ directory is referenced by the #SBATCH -o/-e lines above and
# by the symlinks below, so it has to exist before submission.

# if run without sbatch, invoke here
#if [ -z $SLURM_JOB_ID ]; then
#    mkdir -p logs
#    sbatch "$0"
#    exit
#fi

VARIANT=7b1ru

set -euo pipefail

# symlink logs/latest.out and logs/latest.err
ln -f -s "$SLURM_JOB_ID.out" logs/latest.out
ln -f -s "$SLURM_JOB_ID.err" logs/latest.err

KILL_SWITCH_PATH="kill-switch-$VARIANT"
CHECKPOINT_PATH="checkpoints_$VARIANT"
TENSORBOARD_PATH="tensorboard_$VARIANT"

# Data
TOKENIZER_NAME_OR_PATH=bigscience/tokenizer

TRAIN_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3ru_train.txt
VALID_DATA_PATH=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3_validation.txt

PP_SIZE=1
TP_SIZE=1

MICRO_BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=4
# Total number of GPUs across the allocation.
WORLD_SIZE=$((SLURM_GPUS_ON_NODE * SLURM_JOB_NUM_NODES))
GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE * WORLD_SIZE * GRADIENT_ACCUMULATION_STEPS))

# Model parameters
NLAYERS=30
NHIDDEN=4096
NHEADS=32
SEQ_LEN=2048

# NOTE(review): the underscores are passed through literally; this relies
# on the training script's argument parser accepting them - confirm.
TRAIN_SAMPLES=6_348_800

SAVE_INTERVAL=100

ZERO_STAGE=1

mkdir -p ds_configs
config_json="ds_configs/$SLURM_JOB_ID.json"

# Unquoted delimiter on purpose: the shell variables inside the JSON
# must be expanded.
cat <<EOT > "$config_json"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT

# The command line is kept in an array so that every argument reaches
# launch.sh as exactly one word, without relying on word splitting.
# --clip-grad is passed once; the original listed it twice with the
# same value.
cmd=(
  Megatron-DeepSpeed/finetune_t0.py
  --tensor-model-parallel-size "$TP_SIZE"
  --pipeline-model-parallel-size "$PP_SIZE"
  --num-layers "$NLAYERS"
  --hidden-size "$NHIDDEN"
  --num-attention-heads "$NHEADS"
  --seq-length "$SEQ_LEN"
  --max-position-embeddings "$SEQ_LEN"
  --micro-batch-size "$MICRO_BATCH_SIZE"
  --global-batch-size "$GLOBAL_BATCH_SIZE"
  --train-samples "$TRAIN_SAMPLES"
  --tokenizer-type PretrainedFromHF
  --tokenizer-name-or-path "$TOKENIZER_NAME_OR_PATH"
  --init-method-std 0.0048
  --embed-layernorm
  --fp16
  --seed 42
  --position-embedding-type alibi
  --abort-on-unmet-fused-kernel-constraints
  --clip-grad 1.0
  --kill-switch-path "$KILL_SWITCH_PATH"
  --checkpoint-activations
  --pad-vocab-size-to 250880
  --optimizer adam
  --adam-beta1 0.9
  --adam-beta2 0.95
  --adam-eps 1e-8
  --lr 2e-5
  --lr-decay-style constant
  --lr-warmup-samples 0
  --weight-decay 1e-4
  --no-load-optim
  --reset-progress
  --norm-target-loss
  --log-interval 10
  --save-interval "$SAVE_INTERVAL"
  --eval-interval 500
  --eval-iters 1
  --tensorboard-dir "$TENSORBOARD_PATH"
  --tensorboard-queue-size 5
  --log-timers-to-tensorboard
  --log-batch-size-to-tensorboard
  --log-validation-ppl-to-tensorboard
  --save "$CHECKPOINT_PATH"
  --load "$CHECKPOINT_PATH"
  --train-weighted-split-paths-path "$TRAIN_DATA_PATH"
  --valid-weighted-split-paths-path "$VALID_DATA_PATH"
  --dataloader-type single
  --data-impl mmap
  --deepspeed
  --deepspeed_config "$config_json"
  --zero-stage "$ZERO_STAGE"
)

# Log the full command on a single line.
echo "${cmd[*]}"

echo "START $SLURM_JOB_ID: $(date)"

srun --label launch.sh "${cmd[@]}"

echo "END $SLURM_JOB_ID: $(date)"
train_ru.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ "train: 1 0:1 /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-mtf/xp3rumegds/xp3_ru"