OpenCLIP not installed
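The "OpenCLIP not installed" warning is printed once per rank and the run proceeds anyway, so the dependency is being treated as optional. A minimal sketch of the guarded-import pattern that most likely emits it is below; the actual emitting module is not shown in this log, so the names here are an assumption.

# Hypothetical reconstruction of the guarded import that likely prints the warning above.
try:
    import open_clip  # provided by the "open_clip_torch" package on PyPI
    HAS_OPEN_CLIP = True
except ImportError:
    HAS_OPEN_CLIP = False
    print("OpenCLIP not installed")

If OpenCLIP is actually needed for this run, installing open_clip_torch (pip install open_clip_torch) removes the warning; here training setup continues, so it appears to be informational only.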
[2025-02-17 19:21:01,893] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
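The "Setting ds_accelerator to cuda (auto detect)" line is DeepSpeed's accelerator abstraction detecting the GPU backend, emitted once per process. A short sketch (assuming DeepSpeed and a CUDA build of PyTorch are installed, as the log implies) that queries the same detection result:

# Inspect DeepSpeed's auto-detected accelerator; on this cluster it resolves to CUDA.
from deepspeed.accelerator import get_accelerator

acc = get_accelerator()
print(acc.device_name())   # -> "cuda", matching the log line above
print(acc.device_count())  # number of GPUs visible to this process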
[2025-02-17 19:21:08,047] [INFO] [comm.py:652:init_distributed] cdb=None
[2025-02-17 19:21:08,047] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
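The comm.py lines show DeepSpeed setting up its communication backend: cdb=None means no communication backend exists yet on that rank, after which the Torch backend is created with NCCL. A minimal sketch of the call that produces these lines, assuming the usual launcher-provided RANK, WORLD_SIZE, and MASTER_ADDR environment variables:

# Initialize torch.distributed through DeepSpeed; this emits the comm.py log lines above.
import deepspeed
import torch.distributed as dist

deepspeed.init_distributed(dist_backend="nccl")
print(f"rank {dist.get_rank()} / world size {dist.get_world_size()}")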
ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4')
DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square')
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
bits=16,
cache_dir=None,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=4,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=./scripts/zero1.json,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
double_quant=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=False,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
freeze_mm_mlp_adapter=False,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
group_by_modality_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=zhaojiang/llava-clip-text-image-16-nodes,
hub_private_repo=None,
hub_strategy=every_save,
hub_token=,
ignore_data_skip=False,
include_for_metrics=[],
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-14,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=1.0,
logging_strategy=steps,
lora_alpha=16,
lora_bias=none,
lora_dropout=0.05,
lora_enable=False,
lora_r=64,
lora_weight_path=,
lr_scheduler_kwargs={},
lr_scheduler_type=constant_with_warmup,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mm_projector_lr=None,
model_max_length=2048,
mp_parameters=,
mpt_attn_impl=triton,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=3.0,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=4,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=,
quant_type=nf4,
ray_scope=last,
remove_unused_columns=False,
report_to=['wandb'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=1000,
save_strategy=steps,
save_total_limit=1,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=True,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_liger_kernel=False,
use_mps_device=False,
warmup_ratio=0.01,
warmup_steps=0,
weight_decay=0.0,
)
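The three dumps above are the dataclass reprs the training script logs after parsing its command line, almost certainly via transformers.HfArgumentParser. A trimmed reconstruction is sketched below: the field names and defaults come from the logged reprs, while the types and the parsing code are assumptions, and the extra fields visible in TrainingArguments (bits, lora_*, mm_projector_lr, group_by_modality_length, ...) imply a project-specific TrainingArguments subclass that is omitted here.

# Sketch only: reconstructed from the logged reprs, not the project's actual source.
from dataclasses import dataclass
from typing import Optional
from transformers import HfArgumentParser, TrainingArguments

@dataclass
class ModelArguments:
    model_name_or_path: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    version: str = "qwen"
    freeze_backbone: bool = True
    gen_vision_tower: str = "eva-clip-E-14-plus"
    mm_projector_type: str = "mlp2x_gelu"
    gen_projector_type: str = "mlp2x_gelu"
    n_query: int = 64
    n_und_query: int = 729
    gen_pooling: str = "early_pool2d_4"

@dataclass
class DataArguments:
    data_path: Optional[str] = None
    lazy_preprocess: bool = True
    is_multimodal: bool = False
    data_type: str = "mix"
    image_aspect_ratio: str = "square"

if __name__ == "__main__":
    # Run with at least --output_dir plus any overrides; unset fields keep the defaults above.
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print(model_args, data_args, training_args)  # this is what fills the log above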
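TrainingArguments points at deepspeed=./scripts/zero1.json, i.e. ZeRO stage 1 (optimizer-state sharding only), which fits the frozen-backbone bf16 setup above. The file itself is not in the log; the snippet below writes a plausible ZeRO-1 config consistent with the logged arguments and is an assumption, not the project's actual scripts/zero1.json.

# Assumed ZeRO-1 config; "auto" lets the HF Trainer fill in batch size, clipping, and precision.
import json

zero1_config = {
    "zero_optimization": {"stage": 1},
    "bf16": {"enabled": "auto"},
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
}

with open("scripts/zero1.json", "w") as f:
    json.dump(zero1_config, f, indent=2)

For reference, the effective global batch is per_device_train_batch_size (8) x world size x gradient_accumulation_steps (1); assuming 8 GPUs per node on the 16 nodes named in run_name, that is 8 x 128 x 1 = 1024 samples per optimizer step.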
pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4')
DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square')
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08,
auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False,
bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None,
dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None,
ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[],
deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None,
do_eval=False, do_predict=False, do_train=False, double_quant=True,
eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None,
fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1,
freeze_mm_mlp_adapter=False,
fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None,
group_by_length=False, group_by_modality_length=False, half_precision_backend=auto,
hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=,
ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False,
label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=0,
log_level=passive, log_level_replica=warning, log_on_each_node=True,
logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-273, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps,
lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=,
lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1,
metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048,
mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0,
optim=adamw_torch, optim_args=None, optim_target_modules=None,
output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1,
per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False,
push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=,
quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None,
run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4,
save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1,
seed=42, skip_memory_metrics=True, split_batches=None, tf32=True,
torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None,
tpu_metrics_debug=False, tpu_num_cores=None,
use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False,
warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0,
)
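For reference, a minimal sketch of how an argument dump like the one above is typically produced in an HF-Trainer-style entrypoint: three dataclasses parsed with transformers.HfArgumentParser and printed on each rank (which is why the same block repeats per node in this log, differing only in local_rank and the logging_dir host suffix). The field lists below are trimmed to a handful of the values visible in the dump; this is an illustrative reconstruction, not the project's actual train.py.

# sketch_parse_args.py -- illustrative only; run with e.g. --output_dir /tmp/out
from dataclasses import dataclass
from typing import Optional

import transformers


@dataclass
class ModelArguments:
    # A few of the model-side options seen in the log; the real dataclass has more.
    model_name_or_path: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    version: str = "qwen"
    freeze_backbone: bool = True
    gen_vision_tower: str = "eva-clip-E-14-plus"
    mm_projector_type: str = "mlp2x_gelu"
    n_query: int = 64
    gen_pooling: str = "early_pool2d_4"


@dataclass
class DataArguments:
    # Data-side options; paths are supplied on the command line in practice.
    data_path: Optional[str] = None
    lazy_preprocess: bool = True
    data_type: str = "mix"
    image_aspect_ratio: str = "square"


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    # Extra knobs that appear in the dump beyond the stock HF fields.
    mm_projector_lr: Optional[float] = None
    lora_enable: bool = False
    bits: int = 16


if __name__ == "__main__":
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # Each rank printing its parsed dataclasses produces the repeated
    # ModelArguments/DataArguments/TrainingArguments blocks seen in this log.
    print(model_args, data_args, training_args)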
metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 
'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=3, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-358, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') 
TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=2, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-340, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, 
tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 
'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=4, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-358, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, 
eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-358, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', 
pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=6, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-275, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, 
save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, 
load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-44, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, 
ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspload_best_model_at_end=False, local_rank=3, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-96, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, 
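For context, the effective global batch size implied by these values is per_device_train_batch_size × gradient_accumulation_steps × world size. Assuming the 16-node, 8-GPU-per-node layout suggested by the run name and the local_rank values (128 processes in total, an assumption not stated explicitly in the log), that is 8 × 1 × 128 = 1024 samples per optimizer step.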
save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=2, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-273, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, 
quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, 
include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=4, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-44, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, 
gen_pooling='early_pool2d_4')
DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square')
TrainingArguments(
  _n_gpu=1,
  accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
  adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False,
  bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None,
  dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None,
  ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[],
  deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True,
  eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None,
  fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False,
  fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False,
  gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto,
  hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False,
  include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False,
  label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=7,
  log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-96, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps,
  lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=,
  lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0,
  optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1,
  per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False,
  push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None,
  run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None,
  tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None,
  use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False,
  warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0,
)
(The same ModelArguments, DataArguments, and TrainingArguments block is printed by every rank on every node; the per-rank copies differ only in local_rank and the host suffix of logging_dir.)
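For orientation, the sketch below shows how argument dumps like the one above are typically produced. It assumes the training script follows the common LLaVA-style pattern of parsing three dataclasses with transformers.HfArgumentParser; the dataclass fields shown are an abbreviated, illustrative subset of those in the log, and the names here are not taken from the actual script.

```python
# Minimal sketch (assumption: LLaVA-style argument parsing via HfArgumentParser;
# field lists are abbreviated for illustration).
from dataclasses import dataclass
from typing import Optional

import transformers


@dataclass
class ModelArguments:
    model_name_or_path: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    version: str = "qwen"
    freeze_backbone: bool = True
    gen_vision_tower: str = "eva-clip-E-14-plus"
    mm_projector_type: str = "mlp2x_gelu"
    gen_projector_type: str = "mlp2x_gelu"
    n_query: int = 64
    n_und_query: int = 729
    gen_pooling: str = "early_pool2d_4"


@dataclass
class DataArguments:
    data_path: Optional[str] = None
    lazy_preprocess: bool = True
    image_folder: Optional[str] = None
    pixelprose_image_folder: Optional[str] = None
    data_type: str = "mix"
    image_aspect_ratio: str = "square"


if __name__ == "__main__":
    # The extra fields in the log (lora_*, bits, double_quant, mm_projector_lr, ...)
    # imply a custom TrainingArguments subclass; the stock class is used here
    # purely for illustration.
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, transformers.TrainingArguments)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # Each process prints its parsed arguments, which is why the block above
    # appears once per rank in the aggregated log.
    print(model_args, data_args, training_args)
```

Run with at least `--output_dir` on the command line (e.g. `python train.py --output_dir /tmp/out --bf16 True`), since TrainingArguments requires it; every remaining value in the dump maps to a CLI flag of the same name.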
image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_tect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, ype='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, 
freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, load_best_model_at_end=False, local_rank=5, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-273, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, 
optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, load_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-340, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, length_column_name=length, load_best_model_at_end=False, local_rank=5, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-271, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, 
torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-271, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, 
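The arguments point DeepSpeed at ./scripts/zero1.json, whose contents are not reproduced in this log. For orientation only, a typical ZeRO stage-1 configuration compatible with the bf16, gradient-clipping and batch-size settings above looks roughly like the generic example below; the "auto" values are placeholders that the HF Trainer's DeepSpeed integration resolves from TrainingArguments, and none of this is taken from the actual file.

import json

# Generic ZeRO-1 example -- not the contents of the run's ./scripts/zero1.json.
zero1_config = {
    "train_micro_batch_size_per_gpu": "auto",   # filled from per_device_train_batch_size
    "gradient_accumulation_steps": "auto",      # filled from gradient_accumulation_steps
    "gradient_clipping": "auto",                # filled from max_grad_norm
    "bf16": {"enabled": "auto"},                # filled from bf16=True
    "zero_optimization": {"stage": 1},          # optimizer-state sharding only
}

with open("zero1_example.json", "w") as f:
    json.dump(zero1_config, f, indent=2)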
metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspquant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 
'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, 
ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=6, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-75, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspload_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-273, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, 
pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_asppush_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ect_ratio='square') ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_tpush_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 
'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, ype='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspeval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, 
hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, load_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-44, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, length_column_name=length, load_best_model_at_end=False, local_rank=2, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-271, 
logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', 
image_asppush_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-340, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, 
ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, load_best_model_at_end=False, local_rank=4, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-96, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, 
max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-271, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_asppush_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, 
save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', 
image_aspload_best_model_at_end=False, local_rank=2, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-121, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', 
pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspeval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, load_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-75, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 
'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) load_best_model_at_end=False, local_rank=5, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-340, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, 
mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-96, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, 
overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, 
dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-75, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, 
adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_asplocal_rank=3, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-75, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, 
overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') 
DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspload_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-275, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': 
push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, 
hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=6, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-96, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, 
data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=5, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-96, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, 
pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=4, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-121, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, 
output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=4, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-75, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': 
False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=2, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-75, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) 
ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=4, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-339, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, 
metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 
'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=5, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-121, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') 
TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=3, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-44, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, 
tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-359, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, 
logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') 
(Note: every rank printed identical TrainingArguments, ModelArguments, and DataArguments dumps; the interleaved copies in the raw log differ only in local_rank and in the node suffix of logging_dir (h100-st-p548xlarge-121, -272, -275, -337, -359 appear in this excerpt). One deduplicated copy is kept below, with local_rank=4 on h100-st-p548xlarge-275 shown as the representative rank.)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
bits=16,
cache_dir=None,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=4,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=./scripts/zero1.json,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
double_quant=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=False,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
freeze_mm_mlp_adapter=False,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
group_by_modality_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=zhaojiang/llava-clip-text-image-16-nodes,
hub_private_repo=None,
hub_strategy=every_save,
hub_token=,
ignore_data_skip=False,
include_for_metrics=[],
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=4,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-275,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=1.0,
logging_strategy=steps,
lora_alpha=16,
lora_bias=none,
lora_dropout=0.05,
lora_enable=False,
lora_r=64,
lora_weight_path=,
lr_scheduler_kwargs={},
lr_scheduler_type=constant_with_warmup,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mm_projector_lr=None,
model_max_length=2048,
mp_parameters=,
mpt_attn_impl=triton,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=3.0,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=4,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=,
quant_type=nf4,
ray_scope=last,
remove_unused_columns=False,
report_to=['wandb'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=1000,
save_strategy=steps,
save_total_limit=1,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=True,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_liger_kernel=False,
use_mps_device=False,
warmup_ratio=0.01,
warmup_steps=0,
weight_decay=0.0,
)

ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4')

DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square')
save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, 
ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspload_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-339, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, ect_ratio='square') ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_tpush_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, 
torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ype='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-275, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, 
save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=5, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-275, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, 
quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, 
include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-274, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, 
dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=6, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-338, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, 
pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-337, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, 
output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', 
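ModelArguments and DataArguments (and the extra fields layered onto TrainingArguments such as bits, lora_enable, mm_projector_lr, and group_by_modality_length) are not part of stock transformers; they come from the training script itself. The sketch below is only an illustration, assuming the usual LLaVA-style pattern of declaring these dataclasses and parsing them with transformers.HfArgumentParser; the field subsets are copied from the dump above, and the real script declares many more.

from dataclasses import dataclass
from typing import Optional

import transformers


@dataclass
class ModelArguments:
    # Illustrative subset of the fields shown in the dump above.
    model_name_or_path: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    version: str = "qwen"
    freeze_backbone: bool = True
    gen_vision_tower: str = "eva-clip-E-14-plus"
    mm_projector_type: str = "mlp2x_gelu"
    gen_projector_type: str = "mlp2x_gelu"
    gen_pooling: str = "early_pool2d_4"
    n_query: int = 64
    n_und_query: int = 729


@dataclass
class DataArguments:
    data_path: Optional[str] = None
    image_folder: Optional[str] = None
    pixelprose_image_folder: Optional[str] = None
    data_type: str = "mix"
    image_aspect_ratio: str = "square"
    lazy_preprocess: bool = True


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    # Extra knobs layered on top of the stock HF TrainingArguments.
    model_max_length: int = 2048
    bits: int = 16
    lora_enable: bool = False
    mm_projector_lr: Optional[float] = None
    group_by_modality_length: bool = False


if __name__ == "__main__":
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # Every launched rank executes this, which is why the same three dumps
    # repeat in the log with different local_rank / logging_dir values.
    print(model_args, data_args, training_args)

With a 16-node launch (see the run name qwen-vl-diff-clip-16-nodes_early_pool2d_4) and 8 GPUs per H100 node, each of the 128 processes parses the same command line and prints its own copy of the three objects.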
mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=2, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-337, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, 
push_to_hub_model_id=None, push_to_hub_organization=None, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, ect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, 
ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, load_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-272, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) load_best_model_at_end=False, local_rank=2, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-339, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, 
max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=7, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-337, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, 
datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=1, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-339, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, 
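The dump points at deepspeed=./scripts/zero1.json, but the file's contents never appear in this log. A ZeRO stage-1 config consistent with the surrounding flags (bf16=True, gradient_accumulation_steps=1, max_grad_norm=1.0) would typically look like the sketch below; the keys are standard DeepSpeed options, the "auto" values are filled in by the HF Trainer integration from TrainingArguments, and the exact contents here are assumed, not taken from the real file.

import json

# Assumed stand-in for ./scripts/zero1.json -- the actual file is not shown in
# the log. "auto" lets the HF Trainer copy values from TrainingArguments
# (micro batch size 8, grad accumulation 1, bf16, gradient clipping 1.0).
zero1_config = {
    "bf16": {"enabled": "auto"},
    "zero_optimization": {"stage": 1},
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
}

with open("scripts/zero1.json", "w") as f:
    json.dump(zero1_config, f, indent=2)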
torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=5, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-274, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, 
save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=./scripts/zero1.json, disable_tqdm=False, dispatch_batches=None, do_eval=False, do_predict=False, do_train=False, double_quant=True, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=None, eval_strategy=no, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, freeze_mm_mlp_adapter=False, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=None, group_by_length=False, group_by_modality_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=zhaojiang/llava-clip-text-image-16-nodes, hub_private_repo=None, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, 
local_rank=6, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen/runs/Feb17_19-21-08_h100-st-p548xlarge-274, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=1.0, logging_strategy=steps, lora_alpha=16, lora_bias=none, lora_dropout=0.05, lora_enable=False, lora_r=64, lora_weight_path=, lr_scheduler_kwargs={}, lr_scheduler_type=constant_with_warmup, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mm_projector_lr=None, model_max_length=2048, mp_parameters=, mpt_attn_impl=triton, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=3.0, optim=adamw_torch, optim_args=None, optim_target_modules=None, output_dir=/fsx_0/user/zhaojiang/models/qwen-vl-gen, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=4, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, quant_type=nf4, ray_scope=last, remove_unused_columns=False, report_to=['wandb'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=qwen-vl-diff-clip-16-nodes_early_pool2d_4, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=steps, save_total_limit=1, seed=42, skip_memory_metrics=True, split_batches=None, tf32=True, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.01, warmup_steps=0, weight_decay=0.0, ) ModelArguments(model_name_or_path='Qwen/Qwen2.5-VL-7B-Instruct', version='qwen', freeze_backbone=True, tune_mm_mlp_adapter=False, vision_tower=None, gen_vision_tower='eva-clip-E-14-plus', mm_vision_select_layer=-2, pretrain_mm_mlp_adapter=None, pretrain_gen_mlp_adapter=None, vision_tower_pretrained=None, mm_projector_type='mlp2x_gelu', gen_projector_type='mlp2x_gelu', mm_use_im_start_end=False, mm_use_im_patch_token=False, mm_patch_merge_type='flat', mm_vision_select_feature='patch', n_query=64, n_und_query=729, gen_pooling='early_pool2d_4') DataArguments(data_path='/fsx_0/user/zhaojiang/data/ShareGPT4V/pixelporse_sharegpt4v_text_image_both.json', lazy_preprocess=True, is_multimodal=False, image_folder='/fsx_0/user/zhaojiang/data/LLaVA-Instruct-150K', pixelprose_image_folder='/fsx_0/user/zhaojiang/models/hub/datasets--tomg-group-umd--pixelprose-shards/snapshots/36facc0ec7ff5ee9bdde1c2e217b3d7999b58411', datacomp_shortcaption_image_folder=None, datacomp_longcaption_image_folder=None, data_type='mix', image_aspect_ratio='square') TrainingArguments( _n_gpu=1, accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, average_tokens_across_devices=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, bits=16, cache_dir=None, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], 
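These three dumps look like dataclass reprs produced by the usual transformers.HfArgumentParser pattern. A minimal sketch of that pattern, assuming the field names and defaults mirror the values logged above (the real training script may define many more fields):

# Sketch only: parse the three argument groups the way transformers.HfArgumentParser is
# commonly used; field names below are copied from the logged repr, not from the real script.
from dataclasses import dataclass
from typing import Optional

import transformers


@dataclass
class ModelArguments:
    model_name_or_path: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    version: str = "qwen"
    freeze_backbone: bool = True
    gen_vision_tower: str = "eva-clip-E-14-plus"
    mm_projector_type: str = "mlp2x_gelu"
    gen_projector_type: str = "mlp2x_gelu"
    n_query: int = 64
    n_und_query: int = 729
    gen_pooling: str = "early_pool2d_4"


@dataclass
class DataArguments:
    data_path: Optional[str] = None
    image_folder: Optional[str] = None
    data_type: str = "mix"
    image_aspect_ratio: str = "square"


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    # A few of the custom fields visible in the dump; stock HF fields are inherited.
    model_max_length: int = 2048
    mm_projector_lr: Optional[float] = None
    group_by_modality_length: bool = False
    lora_enable: bool = False


if __name__ == "__main__":
    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # Every rank runs this print, which is why the same dump appears once per GPU above.
    print(model_args, data_args, training_args)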
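TrainingArguments points DeepSpeed at ./scripts/zero1.json and enables bf16 with gradient checkpointing; the file itself never appears in the log. A hedged sketch of what a ZeRO stage-1 config consistent with those flags could look like (every value here is an assumption; the HF Trainer also accepts such a dict directly through the deepspeed field):

# Hypothetical ZeRO stage-1 config; the actual ./scripts/zero1.json is not shown in the log.
# "auto" defers to the matching TrainingArguments values at launch time.
import json

zero1_config = {
    "bf16": {"enabled": "auto"},                # bf16=True in TrainingArguments
    "zero_optimization": {"stage": 1},          # stage 1: shard optimizer states only
    "gradient_accumulation_steps": "auto",      # gradient_accumulation_steps=1
    "gradient_clipping": "auto",                # max_grad_norm=1.0
    "train_micro_batch_size_per_gpu": "auto",   # per_device_train_batch_size=8
    "train_batch_size": "auto",
}

print(json.dumps(zero1_config, indent=2))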
Using conversation format: qwen
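The "Using conversation format: qwen" message suggests the script selects a chat template keyed on version='qwen'. A hedged illustration of that kind of template-registry lookup; the template text, role markers, and names below are placeholders rather than the project's actual definitions:

# Illustrative template registry; contents are placeholders, not the project's real templates.
from dataclasses import dataclass


@dataclass
class Conversation:
    system: str
    roles: tuple
    sep: str = "\n"


conv_templates = {
    "qwen": Conversation(
        system="You are a helpful assistant.",
        roles=("<|im_start|>user", "<|im_start|>assistant"),
        sep="<|im_end|>\n",
    ),
}


def select_conversation_format(version: str) -> Conversation:
    conv = conv_templates[version]
    print(f"Using conversation format: {version}")
    return conv


default_conversation = select_conversation_format("qwen")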
Loading EVA ViT: eva-clip-E-14-plus
Pretrained: None
EVA-CLIP incompatible_keys:
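"Loading EVA ViT: eva-clip-E-14-plus", "Pretrained: None", and "EVA-CLIP incompatible_keys:" match the usual pattern of building a vision tower and loading its weights non-strictly, then printing the key report that load_state_dict returns. A minimal PyTorch sketch with a placeholder module; the real EVA-CLIP tower and its checkpoint handling live in the project's own code:

from typing import Optional

import torch
import torch.nn as nn

# Placeholder stand-in for the EVA-CLIP vision tower.
vision_tower = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024))


def load_eva_vit(model: nn.Module, model_name: str, pretrained: Optional[str]) -> None:
    print(f"Loading EVA ViT: {model_name}")
    print(f"Pretrained: {pretrained}")
    # With a real checkpoint this would be torch.load(pretrained, map_location="cpu").
    state_dict = {}
    # strict=False tolerates mismatches and returns the missing/unexpected key report,
    # which is what the "EVA-CLIP incompatible_keys:" lines above are printing.
    incompatible_keys = model.load_state_dict(state_dict, strict=False)
    print("EVA-CLIP incompatible_keys:", incompatible_keys)


load_eva_vit(vision_tower, "eva-clip-E-14-plus", pretrained=None)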
incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: EVA-CLIP incompatible_keys: Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! EVA-CLIP incompatible_keys: Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! Loaded image processor: random initiation the down_projector !!! random initiation the latent_queries !!! 
Loaded image processor:
random initiation the down_projector !!!
random initiation the latent_queries !!!
EVA-CLIP incompatible_keys:
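The messages above indicate that each rank loads the pretrained EVA-CLIP vision weights non-strictly and leaves the newly added down_projector and latent_queries randomly initialized. A minimal sketch of that loading pattern in PyTorch (the function name and checkpoint path are illustrative assumptions, not taken from the project's code):

import torch

def load_eva_clip_weights(model, ckpt_path):
    """Load pretrained vision weights non-strictly; modules missing from the
    checkpoint (e.g. a newly added down_projector or latent_queries) keep the
    random initialization they received in the model constructor."""
    state_dict = torch.load(ckpt_path, map_location="cpu")
    # load_state_dict(strict=False) returns the missing/unexpected key names,
    # which is the kind of information a line like
    # "EVA-CLIP incompatible_keys:" would report.
    incompatible = model.load_state_dict(state_dict, strict=False)
    print("EVA-CLIP incompatible_keys:", incompatible)
    return model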
Total parameters: 12868467523 Trainable parameters: 1365239712
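These per-rank totals (about 12.87 B parameters, of which about 1.37 B are trainable) are consistent with the standard PyTorch way of counting parameters. A minimal sketch, assuming a constructed model object:

def count_parameters(model):
    """Count total and trainable parameters, as in the summary above."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

# Example usage (hypothetical `model`):
# total, trainable = count_parameters(model)
# print(f"Total parameters: {total} Trainable parameters: {trainable}")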
Total number of training instances: 194420624

idx  name  shape  trainable
-----  --------------------------------------------------------------------  --------------------------------  -----------
0  visual.patch_embed.proj.weight  torch.Size([1280, 3, 2, 14, 14])  False
1  visual.blocks.0.norm1.weight  torch.Size([1280])  False
2  visual.blocks.0.norm2.weight  torch.Size([1280])  False
3  visual.blocks.0.attn.qkv.weight  torch.Size([3840, 1280])  False
4  visual.blocks.0.attn.qkv.bias  torch.Size([3840])  False
5  visual.blocks.0.attn.proj.weight  torch.Size([1280, 1280])  False
6  visual.blocks.0.attn.proj.bias  torch.Size([1280])  False
7  visual.blocks.0.mlp.gate_proj.weight  torch.Size([3420, 1280])  False
8  visual.blocks.0.mlp.gate_proj.bias  torch.Size([3420])  False
9  visual.blocks.0.mlp.up_proj.weight  torch.Size([3420, 1280])  False
10  visual.blocks.0.mlp.up_proj.bias  torch.Size([3420])  False
11  visual.blocks.0.mlp.down_proj.weight  torch.Size([1280, 3420])  False
12  visual.blocks.0.mlp.down_proj.bias  torch.Size([1280])  False
13-384  visual.blocks.1 through visual.blocks.31 (same 12 parameters per block as visual.blocks.0)  False
385  visual.merger.ln_q.weight  torch.Size([1280])  False
386  visual.merger.mlp.0.weight  torch.Size([5120, 5120])  False
387  visual.merger.mlp.0.bias  torch.Size([5120])  False
388  visual.merger.mlp.2.weight  torch.Size([3584, 5120])  False
389  visual.merger.mlp.2.bias  torch.Size([3584])  False
390  model.latent_queries  torch.Size([1, 64, 3584])  True
391  model.embed_tokens.weight  torch.Size([151668, 3584])  False
392  model.layers.0.self_attn.q_proj.weight  torch.Size([3584, 3584])  False
393  model.layers.0.self_attn.q_proj.bias  torch.Size([3584])  False
394  model.layers.0.self_attn.k_proj.weight  torch.Size([512, 3584])  False
395  model.layers.0.self_attn.k_proj.bias  torch.Size([512])  False
396  model.layers.0.self_attn.v_proj.weight  torch.Size([512, 3584])  False
397  model.layers.0.self_attn.v_proj.bias  torch.Size([512])  False
398  model.layers.0.self_attn.o_proj.weight  torch.Size([3584, 3584])  False
399  model.layers.0.mlp.gate_proj.weight  torch.Size([18944, 3584])  False
400  model.layers.0.mlp.up_proj.weight  torch.Size([18944, 3584])  False
401  model.layers.0.mlp.down_proj.weight  torch.Size([3584, 18944])  False
402  model.layers.0.input_layernorm.weight  torch.Size([3584])  False
403  model.layers.0.post_attention_layernorm.weight  torch.Size([3584])  False
404-727  model.layers.1 through model.layers.27 (same 12 parameters per layer as model.layers.0)  False
728  model.norm.weight  torch.Size([3584])  False
729  model.dit.model.caption_projection.linear_1.weight  torch.Size([1792, 3584])  True
730  model.dit.model.caption_projection.linear_1.bias  torch.Size([1792])  True
731  model.dit.model.caption_projection.linear_2.weight  torch.Size([1792, 1792])  True
732  model.dit.model.caption_projection.linear_2.bias  torch.Size([1792])  True
733  model.dit.model.patch_embedder.proj.weight  torch.Size([1792, 1792])  True
734  model.dit.model.patch_embedder.proj.bias  torch.Size([1792])  True
735  model.dit.model.time_caption_embed.timestep_embedder.linear_1.weight  torch.Size([1024, 256])  True
736  model.dit.model.time_caption_embed.timestep_embedder.linear_1.bias  torch.Size([1024])  True
737  model.dit.model.time_caption_embed.timestep_embedder.linear_2.weight  torch.Size([1024, 1024])  True
738  model.dit.model.time_caption_embed.timestep_embedder.linear_2.bias  torch.Size([1024])  True
739  model.dit.model.time_caption_embed.caption_embedder.0.weight  torch.Size([1792])  True
740  model.dit.model.time_caption_embed.caption_embedder.0.bias  torch.Size([1792])  True
741  model.dit.model.time_caption_embed.caption_embedder.1.weight  torch.Size([1024, 1792])  True
742  model.dit.model.time_caption_embed.caption_embedder.1.bias  torch.Size([1024])  True
743  model.dit.model.layers.0.gate  torch.Size([28])  True
744  model.dit.model.layers.0.attn1.norm_q.weight  torch.Size([1792])  True
745  model.dit.model.layers.0.attn1.norm_q.bias  torch.Size([1792])  True
746  model.dit.model.layers.0.attn1.norm_k.weight  torch.Size([1792])  True
747  model.dit.model.layers.0.attn1.norm_k.bias  torch.Size([1792])  True
748  model.dit.model.layers.0.attn1.to_q.weight  torch.Size([1792, 1792])  True
749  model.dit.model.layers.0.attn1.to_k.weight  torch.Size([1792, 1792])  True
750  model.dit.model.layers.0.attn1.to_v.weight  torch.Size([1792, 1792])  True
751  model.dit.model.layers.0.attn2.norm_q.weight  torch.Size([1792])  True
752  model.dit.model.layers.0.attn2.norm_q.bias  torch.Size([1792])  True
753  model.dit.model.layers.0.attn2.norm_k.weight  torch.Size([1792])  True
754  model.dit.model.layers.0.attn2.norm_k.bias  torch.Size([1792])  True
755  model.dit.model.layers.0.attn2.to_q.weight  torch.Size([1792, 1792])  True
756  model.dit.model.layers.0.attn2.to_k.weight
torch.Size([1792, 1792]) True 757 model.dit.model.layers.0.attn2.to_v.weight torch.Size([1792, 1792]) True 758 model.dit.model.layers.0.attn2.to_out.0.weight torch.Size([1792, 1792]) True 759 model.dit.model.layers.0.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 760 model.dit.model.layers.0.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 761 model.dit.model.layers.0.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 762 model.dit.model.layers.0.norm1.linear.weight torch.Size([7168, 1024]) True 763 model.dit.model.layers.0.norm1.linear.bias torch.Size([7168]) True 764 model.dit.model.layers.0.norm1.norm.weight torch.Size([1792]) True 765 model.dit.model.layers.0.ffn_norm1.weight torch.Size([1792]) True 766 model.dit.model.layers.0.norm2.weight torch.Size([1792]) True 767 model.dit.model.layers.0.ffn_norm2.weight torch.Size([1792]) True 768 model.dit.model.layers.0.norm1_context.weight torch.Size([1792]) True 769 model.dit.model.layers.1.gate torch.Size([28]) True 770 model.dit.model.layers.1.attn1.norm_q.weight torch.Size([1792]) True 771 model.dit.model.layers.1.attn1.norm_q.bias torch.Size([1792]) True 772 model.dit.model.layers.1.attn1.norm_k.weight torch.Size([1792]) True 773 model.dit.model.layers.1.attn1.norm_k.bias torch.Size([1792]) True 774 model.dit.model.layers.1.attn1.to_q.weight torch.Size([1792, 1792]) True 775 model.dit.model.layers.1.attn1.to_k.weight torch.Size([1792, 1792]) True 776 model.dit.model.layers.1.attn1.to_v.weight torch.Size([1792, 1792]) True 777 model.dit.model.layers.1.attn2.norm_q.weight torch.Size([1792]) True 778 model.dit.model.layers.1.attn2.norm_q.bias torch.Size([1792]) True 779 model.dit.model.layers.1.attn2.norm_k.weight torch.Size([1792]) True 780 model.dit.model.layers.1.attn2.norm_k.bias torch.Size([1792]) True 781 model.dit.model.layers.1.attn2.to_q.weight torch.Size([1792, 1792]) True 782 model.dit.model.layers.1.attn2.to_k.weight torch.Size([1792, 1792]) True 783 model.dit.model.layers.1.attn2.to_v.weight torch.Size([1792, 1792]) True 784 model.dit.model.layers.1.attn2.to_out.0.weight torch.Size([1792, 1792]) True 785 model.dit.model.layers.1.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 786 model.dit.model.layers.1.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 787 model.dit.model.layers.1.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 788 model.dit.model.layers.1.norm1.linear.weight torch.Size([7168, 1024]) True 789 model.dit.model.layers.1.norm1.linear.bias torch.Size([7168]) True 790 model.dit.model.layers.1.norm1.norm.weight torch.Size([1792]) True 791 model.dit.model.layers.1.ffn_norm1.weight torch.Size([1792]) True 792 model.dit.model.layers.1.norm2.weight torch.Size([1792]) True 793 model.dit.model.layers.1.ffn_norm2.weight torch.Size([1792]) True 794 model.dit.model.layers.1.norm1_context.weight torch.Size([1792]) True 795 model.dit.model.layers.2.gate torch.Size([28]) True 796 model.dit.model.layers.2.attn1.norm_q.weight torch.Size([1792]) True 797 model.dit.model.layers.2.attn1.norm_q.bias torch.Size([1792]) True 798 model.dit.model.layers.2.attn1.norm_k.weight torch.Size([1792]) True 799 model.dit.model.layers.2.attn1.norm_k.bias torch.Size([1792]) True 800 model.dit.model.layers.2.attn1.to_q.weight torch.Size([1792, 1792]) True 801 model.dit.model.layers.2.attn1.to_k.weight torch.Size([1792, 1792]) True 802 model.dit.model.layers.2.attn1.to_v.weight torch.Size([1792, 1792]) True 803 model.dit.model.layers.2.attn2.norm_q.weight torch.Size([1792]) True 804 
model.dit.model.layers.2.attn2.norm_q.bias torch.Size([1792]) True 805 model.dit.model.layers.2.attn2.norm_k.weight torch.Size([1792]) True 806 model.dit.model.layers.2.attn2.norm_k.bias torch.Size([1792]) True 807 model.dit.model.layers.2.attn2.to_q.weight torch.Size([1792, 1792]) True 808 model.dit.model.layers.2.attn2.to_k.weight torch.Size([1792, 1792]) True 809 model.dit.model.layers.2.attn2.to_v.weight torch.Size([1792, 1792]) True 810 model.dit.model.layers.2.attn2.to_out.0.weight torch.Size([1792, 1792]) True 811 model.dit.model.layers.2.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 812 model.dit.model.layers.2.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 813 model.dit.model.layers.2.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 814 model.dit.model.layers.2.norm1.linear.weight torch.Size([7168, 1024]) True 815 model.dit.model.layers.2.norm1.linear.bias torch.Size([7168]) True 816 model.dit.model.layers.2.norm1.norm.weight torch.Size([1792]) True 817 model.dit.model.layers.2.ffn_norm1.weight torch.Size([1792]) True 818 model.dit.model.layers.2.norm2.weight torch.Size([1792]) True 819 model.dit.model.layers.2.ffn_norm2.weight torch.Size([1792]) True 820 model.dit.model.layers.2.norm1_context.weight torch.Size([1792]) True 821 model.dit.model.layers.3.gate torch.Size([28]) True 822 model.dit.model.layers.3.attn1.norm_q.weight torch.Size([1792]) True 823 model.dit.model.layers.3.attn1.norm_q.bias torch.Size([1792]) True 824 model.dit.model.layers.3.attn1.norm_k.weight torch.Size([1792]) True 825 model.dit.model.layers.3.attn1.norm_k.bias torch.Size([1792]) True 826 model.dit.model.layers.3.attn1.to_q.weight torch.Size([1792, 1792]) True 827 model.dit.model.layers.3.attn1.to_k.weight torch.Size([1792, 1792]) True 828 model.dit.model.layers.3.attn1.to_v.weight torch.Size([1792, 1792]) True 829 model.dit.model.layers.3.attn2.norm_q.weight torch.Size([1792]) True 830 model.dit.model.layers.3.attn2.norm_q.bias torch.Size([1792]) True 831 model.dit.model.layers.3.attn2.norm_k.weight torch.Size([1792]) True 832 model.dit.model.layers.3.attn2.norm_k.bias torch.Size([1792]) True 833 model.dit.model.layers.3.attn2.to_q.weight torch.Size([1792, 1792]) True 834 model.dit.model.layers.3.attn2.to_k.weight torch.Size([1792, 1792]) True 835 model.dit.model.layers.3.attn2.to_v.weight torch.Size([1792, 1792]) True 836 model.dit.model.layers.3.attn2.to_out.0.weight torch.Size([1792, 1792]) True 837 model.dit.model.layers.3.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 838 model.dit.model.layers.3.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 839 model.dit.model.layers.3.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 840 model.dit.model.layers.3.norm1.linear.weight torch.Size([7168, 1024]) True 841 model.dit.model.layers.3.norm1.linear.bias torch.Size([7168]) True 842 model.dit.model.layers.3.norm1.norm.weight torch.Size([1792]) True 843 model.dit.model.layers.3.ffn_norm1.weight torch.Size([1792]) True 844 model.dit.model.layers.3.norm2.weight torch.Size([1792]) True 845 model.dit.model.layers.3.ffn_norm2.weight torch.Size([1792]) True 846 model.dit.model.layers.3.norm1_context.weight torch.Size([1792]) True 847 model.dit.model.layers.4.gate torch.Size([28]) True 848 model.dit.model.layers.4.attn1.norm_q.weight torch.Size([1792]) True 849 model.dit.model.layers.4.attn1.norm_q.bias torch.Size([1792]) True 850 model.dit.model.layers.4.attn1.norm_k.weight torch.Size([1792]) True 851 model.dit.model.layers.4.attn1.norm_k.bias 
torch.Size([1792]) True 852 model.dit.model.layers.4.attn1.to_q.weight torch.Size([1792, 1792]) True 853 model.dit.model.layers.4.attn1.to_k.weight torch.Size([1792, 1792]) True 854 model.dit.model.layers.4.attn1.to_v.weight torch.Size([1792, 1792]) True 855 model.dit.model.layers.4.attn2.norm_q.weight torch.Size([1792]) True 856 model.dit.model.layers.4.attn2.norm_q.bias torch.Size([1792]) True 857 model.dit.model.layers.4.attn2.norm_k.weight torch.Size([1792]) True 858 model.dit.model.layers.4.attn2.norm_k.bias torch.Size([1792]) True 859 model.dit.model.layers.4.attn2.to_q.weight torch.Size([1792, 1792]) True 860 model.dit.model.layers.4.attn2.to_k.weight torch.Size([1792, 1792]) True 861 model.dit.model.layers.4.attn2.to_v.weight torch.Size([1792, 1792]) True 862 model.dit.model.layers.4.attn2.to_out.0.weight torch.Size([1792, 1792]) True 863 model.dit.model.layers.4.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 864 model.dit.model.layers.4.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 865 model.dit.model.layers.4.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 866 model.dit.model.layers.4.norm1.linear.weight torch.Size([7168, 1024]) True 867 model.dit.model.layers.4.norm1.linear.bias torch.Size([7168]) True 868 model.dit.model.layers.4.norm1.norm.weight torch.Size([1792]) True 869 model.dit.model.layers.4.ffn_norm1.weight torch.Size([1792]) True 870 model.dit.model.layers.4.norm2.weight torch.Size([1792]) True 871 model.dit.model.layers.4.ffn_norm2.weight torch.Size([1792]) True 872 model.dit.model.layers.4.norm1_context.weight torch.Size([1792]) True 873 model.dit.model.layers.5.gate torch.Size([28]) True 874 model.dit.model.layers.5.attn1.norm_q.weight torch.Size([1792]) True 875 model.dit.model.layers.5.attn1.norm_q.bias torch.Size([1792]) True 876 model.dit.model.layers.5.attn1.norm_k.weight torch.Size([1792]) True 877 model.dit.model.layers.5.attn1.norm_k.bias torch.Size([1792]) True 878 model.dit.model.layers.5.attn1.to_q.weight torch.Size([1792, 1792]) True 879 model.dit.model.layers.5.attn1.to_k.weight torch.Size([1792, 1792]) True 880 model.dit.model.layers.5.attn1.to_v.weight torch.Size([1792, 1792]) True 881 model.dit.model.layers.5.attn2.norm_q.weight torch.Size([1792]) True 882 model.dit.model.layers.5.attn2.norm_q.bias torch.Size([1792]) True 883 model.dit.model.layers.5.attn2.norm_k.weight torch.Size([1792]) True 884 model.dit.model.layers.5.attn2.norm_k.bias torch.Size([1792]) True 885 model.dit.model.layers.5.attn2.to_q.weight torch.Size([1792, 1792]) True 886 model.dit.model.layers.5.attn2.to_k.weight torch.Size([1792, 1792]) True 887 model.dit.model.layers.5.attn2.to_v.weight torch.Size([1792, 1792]) True 888 model.dit.model.layers.5.attn2.to_out.0.weight torch.Size([1792, 1792]) True 889 model.dit.model.layers.5.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 890 model.dit.model.layers.5.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 891 model.dit.model.layers.5.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 892 model.dit.model.layers.5.norm1.linear.weight torch.Size([7168, 1024]) True 893 model.dit.model.layers.5.norm1.linear.bias torch.Size([7168]) True 894 model.dit.model.layers.5.norm1.norm.weight torch.Size([1792]) True 895 model.dit.model.layers.5.ffn_norm1.weight torch.Size([1792]) True 896 model.dit.model.layers.5.norm2.weight torch.Size([1792]) True 897 model.dit.model.layers.5.ffn_norm2.weight torch.Size([1792]) True 898 model.dit.model.layers.5.norm1_context.weight torch.Size([1792]) 
True 899 model.dit.model.layers.6.gate torch.Size([28]) True 900 model.dit.model.layers.6.attn1.norm_q.weight torch.Size([1792]) True 901 model.dit.model.layers.6.attn1.norm_q.bias torch.Size([1792]) True 902 model.dit.model.layers.6.attn1.norm_k.weight torch.Size([1792]) True 903 model.dit.model.layers.6.attn1.norm_k.bias torch.Size([1792]) True 904 model.dit.model.layers.6.attn1.to_q.weight torch.Size([1792, 1792]) True 905 model.dit.model.layers.6.attn1.to_k.weight torch.Size([1792, 1792]) True 906 model.dit.model.layers.6.attn1.to_v.weight torch.Size([1792, 1792]) True 907 model.dit.model.layers.6.attn2.norm_q.weight torch.Size([1792]) True 908 model.dit.model.layers.6.attn2.norm_q.bias torch.Size([1792]) True 909 model.dit.model.layers.6.attn2.norm_k.weight torch.Size([1792]) True 910 model.dit.model.layers.6.attn2.norm_k.bias torch.Size([1792]) True 911 model.dit.model.layers.6.attn2.to_q.weight torch.Size([1792, 1792]) True 912 model.dit.model.layers.6.attn2.to_k.weight torch.Size([1792, 1792]) True 913 model.dit.model.layers.6.attn2.to_v.weight torch.Size([1792, 1792]) True 914 model.dit.model.layers.6.attn2.to_out.0.weight torch.Size([1792, 1792]) True 915 model.dit.model.layers.6.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 916 model.dit.model.layers.6.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 917 model.dit.model.layers.6.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 918 model.dit.model.layers.6.norm1.linear.weight torch.Size([7168, 1024]) True 919 model.dit.model.layers.6.norm1.linear.bias torch.Size([7168]) True 920 model.dit.model.layers.6.norm1.norm.weight torch.Size([1792]) True 921 model.dit.model.layers.6.ffn_norm1.weight torch.Size([1792]) True 922 model.dit.model.layers.6.norm2.weight torch.Size([1792]) True 923 model.dit.model.layers.6.ffn_norm2.weight torch.Size([1792]) True 924 model.dit.model.layers.6.norm1_context.weight torch.Size([1792]) True 925 model.dit.model.layers.7.gate torch.Size([28]) True 926 model.dit.model.layers.7.attn1.norm_q.weight torch.Size([1792]) True 927 model.dit.model.layers.7.attn1.norm_q.bias torch.Size([1792]) True 928 model.dit.model.layers.7.attn1.norm_k.weight torch.Size([1792]) True 929 model.dit.model.layers.7.attn1.norm_k.bias torch.Size([1792]) True 930 model.dit.model.layers.7.attn1.to_q.weight torch.Size([1792, 1792]) True 931 model.dit.model.layers.7.attn1.to_k.weight torch.Size([1792, 1792]) True 932 model.dit.model.layers.7.attn1.to_v.weight torch.Size([1792, 1792]) True 933 model.dit.model.layers.7.attn2.norm_q.weight torch.Size([1792]) True 934 model.dit.model.layers.7.attn2.norm_q.bias torch.Size([1792]) True 935 model.dit.model.layers.7.attn2.norm_k.weight torch.Size([1792]) True 936 model.dit.model.layers.7.attn2.norm_k.bias torch.Size([1792]) True 937 model.dit.model.layers.7.attn2.to_q.weight torch.Size([1792, 1792]) True 938 model.dit.model.layers.7.attn2.to_k.weight torch.Size([1792, 1792]) True 939 model.dit.model.layers.7.attn2.to_v.weight torch.Size([1792, 1792]) True 940 model.dit.model.layers.7.attn2.to_out.0.weight torch.Size([1792, 1792]) True 941 model.dit.model.layers.7.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 942 model.dit.model.layers.7.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 943 model.dit.model.layers.7.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 944 model.dit.model.layers.7.norm1.linear.weight torch.Size([7168, 1024]) True 945 model.dit.model.layers.7.norm1.linear.bias torch.Size([7168]) True 946 
model.dit.model.layers.7.norm1.norm.weight torch.Size([1792]) True 947 model.dit.model.layers.7.ffn_norm1.weight torch.Size([1792]) True 948 model.dit.model.layers.7.norm2.weight torch.Size([1792]) True 949 model.dit.model.layers.7.ffn_norm2.weight torch.Size([1792]) True 950 model.dit.model.layers.7.norm1_context.weight torch.Size([1792]) True 951 model.dit.model.layers.8.gate torch.Size([28]) True 952 model.dit.model.layers.8.attn1.norm_q.weight torch.Size([1792]) True 953 model.dit.model.layers.8.attn1.norm_q.bias torch.Size([1792]) True 954 model.dit.model.layers.8.attn1.norm_k.weight torch.Size([1792]) True 955 model.dit.model.layers.8.attn1.norm_k.bias torch.Size([1792]) True 956 model.dit.model.layers.8.attn1.to_q.weight torch.Size([1792, 1792]) True 957 model.dit.model.layers.8.attn1.to_k.weight torch.Size([1792, 1792]) True 958 model.dit.model.layers.8.attn1.to_v.weight torch.Size([1792, 1792]) True 959 model.dit.model.layers.8.attn2.norm_q.weight torch.Size([1792]) True 960 model.dit.model.layers.8.attn2.norm_q.bias torch.Size([1792]) True 961 model.dit.model.layers.8.attn2.norm_k.weight torch.Size([1792]) True 962 model.dit.model.layers.8.attn2.norm_k.bias torch.Size([1792]) True 963 model.dit.model.layers.8.attn2.to_q.weight torch.Size([1792, 1792]) True 964 model.dit.model.layers.8.attn2.to_k.weight torch.Size([1792, 1792]) True 965 model.dit.model.layers.8.attn2.to_v.weight torch.Size([1792, 1792]) True 966 model.dit.model.layers.8.attn2.to_out.0.weight torch.Size([1792, 1792]) True 967 model.dit.model.layers.8.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 968 model.dit.model.layers.8.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 969 model.dit.model.layers.8.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 970 model.dit.model.layers.8.norm1.linear.weight torch.Size([7168, 1024]) True 971 model.dit.model.layers.8.norm1.linear.bias torch.Size([7168]) True 972 model.dit.model.layers.8.norm1.norm.weight torch.Size([1792]) True 973 model.dit.model.layers.8.ffn_norm1.weight torch.Size([1792]) True 974 model.dit.model.layers.8.norm2.weight torch.Size([1792]) True 975 model.dit.model.layers.8.ffn_norm2.weight torch.Size([1792]) True 976 model.dit.model.layers.8.norm1_context.weight torch.Size([1792]) True 977 model.dit.model.layers.9.gate torch.Size([28]) True 978 model.dit.model.layers.9.attn1.norm_q.weight torch.Size([1792]) True 979 model.dit.model.layers.9.attn1.norm_q.bias torch.Size([1792]) True 980 model.dit.model.layers.9.attn1.norm_k.weight torch.Size([1792]) True 981 model.dit.model.layers.9.attn1.norm_k.bias torch.Size([1792]) True 982 model.dit.model.layers.9.attn1.to_q.weight torch.Size([1792, 1792]) True 983 model.dit.model.layers.9.attn1.to_k.weight torch.Size([1792, 1792]) True 984 model.dit.model.layers.9.attn1.to_v.weight torch.Size([1792, 1792]) True 985 model.dit.model.layers.9.attn2.norm_q.weight torch.Size([1792]) True 986 model.dit.model.layers.9.attn2.norm_q.bias torch.Size([1792]) True 987 model.dit.model.layers.9.attn2.norm_k.weight torch.Size([1792]) True 988 model.dit.model.layers.9.attn2.norm_k.bias torch.Size([1792]) True 989 model.dit.model.layers.9.attn2.to_q.weight torch.Size([1792, 1792]) True 990 model.dit.model.layers.9.attn2.to_k.weight torch.Size([1792, 1792]) True 991 model.dit.model.layers.9.attn2.to_v.weight torch.Size([1792, 1792]) True 992 model.dit.model.layers.9.attn2.to_out.0.weight torch.Size([1792, 1792]) True 993 model.dit.model.layers.9.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 994 
model.dit.model.layers.9.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 995 model.dit.model.layers.9.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 996 model.dit.model.layers.9.norm1.linear.weight torch.Size([7168, 1024]) True 997 model.dit.model.layers.9.norm1.linear.bias torch.Size([7168]) True 998 model.dit.model.layers.9.norm1.norm.weight torch.Size([1792]) True 999 model.dit.model.layers.9.ffn_norm1.weight torch.Size([1792]) True 1000 model.dit.model.layers.9.norm2.weight torch.Size([1792]) True 1001 model.dit.model.layers.9.ffn_norm2.weight torch.Size([1792]) True 1002 model.dit.model.layers.9.norm1_context.weight torch.Size([1792]) True 1003 model.dit.model.layers.10.gate torch.Size([28]) True 1004 model.dit.model.layers.10.attn1.norm_q.weight torch.Size([1792]) True 1005 model.dit.model.layers.10.attn1.norm_q.bias torch.Size([1792]) True 1006 model.dit.model.layers.10.attn1.norm_k.weight torch.Size([1792]) True 1007 model.dit.model.layers.10.attn1.norm_k.bias torch.Size([1792]) True 1008 model.dit.model.layers.10.attn1.to_q.weight torch.Size([1792, 1792]) True 1009 model.dit.model.layers.10.attn1.to_k.weight torch.Size([1792, 1792]) True 1010 model.dit.model.layers.10.attn1.to_v.weight torch.Size([1792, 1792]) True 1011 model.dit.model.layers.10.attn2.norm_q.weight torch.Size([1792]) True 1012 model.dit.model.layers.10.attn2.norm_q.bias torch.Size([1792]) True 1013 model.dit.model.layers.10.attn2.norm_k.weight torch.Size([1792]) True 1014 model.dit.model.layers.10.attn2.norm_k.bias torch.Size([1792]) True 1015 model.dit.model.layers.10.attn2.to_q.weight torch.Size([1792, 1792]) True 1016 model.dit.model.layers.10.attn2.to_k.weight torch.Size([1792, 1792]) True 1017 model.dit.model.layers.10.attn2.to_v.weight torch.Size([1792, 1792]) True 1018 model.dit.model.layers.10.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1019 model.dit.model.layers.10.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1020 model.dit.model.layers.10.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1021 model.dit.model.layers.10.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1022 model.dit.model.layers.10.norm1.linear.weight torch.Size([7168, 1024]) True 1023 model.dit.model.layers.10.norm1.linear.bias torch.Size([7168]) True 1024 model.dit.model.layers.10.norm1.norm.weight torch.Size([1792]) True 1025 model.dit.model.layers.10.ffn_norm1.weight torch.Size([1792]) True 1026 model.dit.model.layers.10.norm2.weight torch.Size([1792]) True 1027 model.dit.model.layers.10.ffn_norm2.weight torch.Size([1792]) True 1028 model.dit.model.layers.10.norm1_context.weight torch.Size([1792]) True 1029 model.dit.model.layers.11.gate torch.Size([28]) True 1030 model.dit.model.layers.11.attn1.norm_q.weight torch.Size([1792]) True 1031 model.dit.model.layers.11.attn1.norm_q.bias torch.Size([1792]) True 1032 model.dit.model.layers.11.attn1.norm_k.weight torch.Size([1792]) True 1033 model.dit.model.layers.11.attn1.norm_k.bias torch.Size([1792]) True 1034 model.dit.model.layers.11.attn1.to_q.weight torch.Size([1792, 1792]) True 1035 model.dit.model.layers.11.attn1.to_k.weight torch.Size([1792, 1792]) True 1036 model.dit.model.layers.11.attn1.to_v.weight torch.Size([1792, 1792]) True 1037 model.dit.model.layers.11.attn2.norm_q.weight torch.Size([1792]) True 1038 model.dit.model.layers.11.attn2.norm_q.bias torch.Size([1792]) True 1039 model.dit.model.layers.11.attn2.norm_k.weight torch.Size([1792]) True 1040 model.dit.model.layers.11.attn2.norm_k.bias torch.Size([1792]) True 
1041 model.dit.model.layers.11.attn2.to_q.weight torch.Size([1792, 1792]) True 1042 model.dit.model.layers.11.attn2.to_k.weight torch.Size([1792, 1792]) True 1043 model.dit.model.layers.11.attn2.to_v.weight torch.Size([1792, 1792]) True 1044 model.dit.model.layers.11.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1045 model.dit.model.layers.11.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1046 model.dit.model.layers.11.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1047 model.dit.model.layers.11.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1048 model.dit.model.layers.11.norm1.linear.weight torch.Size([7168, 1024]) True 1049 model.dit.model.layers.11.norm1.linear.bias torch.Size([7168]) True 1050 model.dit.model.layers.11.norm1.norm.weight torch.Size([1792]) True 1051 model.dit.model.layers.11.ffn_norm1.weight torch.Size([1792]) True 1052 model.dit.model.layers.11.norm2.weight torch.Size([1792]) True 1053 model.dit.model.layers.11.ffn_norm2.weight torch.Size([1792]) True 1054 model.dit.model.layers.11.norm1_context.weight torch.Size([1792]) True 1055 model.dit.model.layers.12.gate torch.Size([28]) True 1056 model.dit.model.layers.12.attn1.norm_q.weight torch.Size([1792]) True 1057 model.dit.model.layers.12.attn1.norm_q.bias torch.Size([1792]) True 1058 model.dit.model.layers.12.attn1.norm_k.weight torch.Size([1792]) True 1059 model.dit.model.layers.12.attn1.norm_k.bias torch.Size([1792]) True 1060 model.dit.model.layers.12.attn1.to_q.weight torch.Size([1792, 1792]) True 1061 model.dit.model.layers.12.attn1.to_k.weight torch.Size([1792, 1792]) True 1062 model.dit.model.layers.12.attn1.to_v.weight torch.Size([1792, 1792]) True 1063 model.dit.model.layers.12.attn2.norm_q.weight torch.Size([1792]) True 1064 model.dit.model.layers.12.attn2.norm_q.bias torch.Size([1792]) True 1065 model.dit.model.layers.12.attn2.norm_k.weight torch.Size([1792]) True 1066 model.dit.model.layers.12.attn2.norm_k.bias torch.Size([1792]) True 1067 model.dit.model.layers.12.attn2.to_q.weight torch.Size([1792, 1792]) True 1068 model.dit.model.layers.12.attn2.to_k.weight torch.Size([1792, 1792]) True 1069 model.dit.model.layers.12.attn2.to_v.weight torch.Size([1792, 1792]) True 1070 model.dit.model.layers.12.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1071 model.dit.model.layers.12.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1072 model.dit.model.layers.12.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1073 model.dit.model.layers.12.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1074 model.dit.model.layers.12.norm1.linear.weight torch.Size([7168, 1024]) True 1075 model.dit.model.layers.12.norm1.linear.bias torch.Size([7168]) True 1076 model.dit.model.layers.12.norm1.norm.weight torch.Size([1792]) True 1077 model.dit.model.layers.12.ffn_norm1.weight torch.Size([1792]) True 1078 model.dit.model.layers.12.norm2.weight torch.Size([1792]) True 1079 model.dit.model.layers.12.ffn_norm2.weight torch.Size([1792]) True 1080 model.dit.model.layers.12.norm1_context.weight torch.Size([1792]) True 1081 model.dit.model.layers.13.gate torch.Size([28]) True 1082 model.dit.model.layers.13.attn1.norm_q.weight torch.Size([1792]) True 1083 model.dit.model.layers.13.attn1.norm_q.bias torch.Size([1792]) True 1084 model.dit.model.layers.13.attn1.norm_k.weight torch.Size([1792]) True 1085 model.dit.model.layers.13.attn1.norm_k.bias torch.Size([1792]) True 1086 model.dit.model.layers.13.attn1.to_q.weight torch.Size([1792, 1792]) True 1087 
model.dit.model.layers.13.attn1.to_k.weight torch.Size([1792, 1792]) True 1088 model.dit.model.layers.13.attn1.to_v.weight torch.Size([1792, 1792]) True 1089 model.dit.model.layers.13.attn2.norm_q.weight torch.Size([1792]) True 1090 model.dit.model.layers.13.attn2.norm_q.bias torch.Size([1792]) True 1091 model.dit.model.layers.13.attn2.norm_k.weight torch.Size([1792]) True 1092 model.dit.model.layers.13.attn2.norm_k.bias torch.Size([1792]) True 1093 model.dit.model.layers.13.attn2.to_q.weight torch.Size([1792, 1792]) True 1094 model.dit.model.layers.13.attn2.to_k.weight torch.Size([1792, 1792]) True 1095 model.dit.model.layers.13.attn2.to_v.weight torch.Size([1792, 1792]) True 1096 model.dit.model.layers.13.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1097 model.dit.model.layers.13.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1098 model.dit.model.layers.13.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1099 model.dit.model.layers.13.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1100 model.dit.model.layers.13.norm1.linear.weight torch.Size([7168, 1024]) True 1101 model.dit.model.layers.13.norm1.linear.bias torch.Size([7168]) True 1102 model.dit.model.layers.13.norm1.norm.weight torch.Size([1792]) True 1103 model.dit.model.layers.13.ffn_norm1.weight torch.Size([1792]) True 1104 model.dit.model.layers.13.norm2.weight torch.Size([1792]) True 1105 model.dit.model.layers.13.ffn_norm2.weight torch.Size([1792]) True 1106 model.dit.model.layers.13.norm1_context.weight torch.Size([1792]) True 1107 model.dit.model.layers.14.gate torch.Size([28]) True 1108 model.dit.model.layers.14.attn1.norm_q.weight torch.Size([1792]) True 1109 model.dit.model.layers.14.attn1.norm_q.bias torch.Size([1792]) True 1110 model.dit.model.layers.14.attn1.norm_k.weight torch.Size([1792]) True 1111 model.dit.model.layers.14.attn1.norm_k.bias torch.Size([1792]) True 1112 model.dit.model.layers.14.attn1.to_q.weight torch.Size([1792, 1792]) True 1113 model.dit.model.layers.14.attn1.to_k.weight torch.Size([1792, 1792]) True 1114 model.dit.model.layers.14.attn1.to_v.weight torch.Size([1792, 1792]) True 1115 model.dit.model.layers.14.attn2.norm_q.weight torch.Size([1792]) True 1116 model.dit.model.layers.14.attn2.norm_q.bias torch.Size([1792]) True 1117 model.dit.model.layers.14.attn2.norm_k.weight torch.Size([1792]) True 1118 model.dit.model.layers.14.attn2.norm_k.bias torch.Size([1792]) True 1119 model.dit.model.layers.14.attn2.to_q.weight torch.Size([1792, 1792]) True 1120 model.dit.model.layers.14.attn2.to_k.weight torch.Size([1792, 1792]) True 1121 model.dit.model.layers.14.attn2.to_v.weight torch.Size([1792, 1792]) True 1122 model.dit.model.layers.14.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1123 model.dit.model.layers.14.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1124 model.dit.model.layers.14.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1125 model.dit.model.layers.14.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1126 model.dit.model.layers.14.norm1.linear.weight torch.Size([7168, 1024]) True 1127 model.dit.model.layers.14.norm1.linear.bias torch.Size([7168]) True 1128 model.dit.model.layers.14.norm1.norm.weight torch.Size([1792]) True 1129 model.dit.model.layers.14.ffn_norm1.weight torch.Size([1792]) True 1130 model.dit.model.layers.14.norm2.weight torch.Size([1792]) True 1131 model.dit.model.layers.14.ffn_norm2.weight torch.Size([1792]) True 1132 model.dit.model.layers.14.norm1_context.weight torch.Size([1792]) True 1133 
model.dit.model.layers.15.gate torch.Size([28]) True 1134 model.dit.model.layers.15.attn1.norm_q.weight torch.Size([1792]) True 1135 model.dit.model.layers.15.attn1.norm_q.bias torch.Size([1792]) True 1136 model.dit.model.layers.15.attn1.norm_k.weight torch.Size([1792]) True 1137 model.dit.model.layers.15.attn1.norm_k.bias torch.Size([1792]) True 1138 model.dit.model.layers.15.attn1.to_q.weight torch.Size([1792, 1792]) True 1139 model.dit.model.layers.15.attn1.to_k.weight torch.Size([1792, 1792]) True 1140 model.dit.model.layers.15.attn1.to_v.weight torch.Size([1792, 1792]) True 1141 model.dit.model.layers.15.attn2.norm_q.weight torch.Size([1792]) True 1142 model.dit.model.layers.15.attn2.norm_q.bias torch.Size([1792]) True 1143 model.dit.model.layers.15.attn2.norm_k.weight torch.Size([1792]) True 1144 model.dit.model.layers.15.attn2.norm_k.bias torch.Size([1792]) True 1145 model.dit.model.layers.15.attn2.to_q.weight torch.Size([1792, 1792]) True 1146 model.dit.model.layers.15.attn2.to_k.weight torch.Size([1792, 1792]) True 1147 model.dit.model.layers.15.attn2.to_v.weight torch.Size([1792, 1792]) True 1148 model.dit.model.layers.15.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1149 model.dit.model.layers.15.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1150 model.dit.model.layers.15.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1151 model.dit.model.layers.15.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1152 model.dit.model.layers.15.norm1.linear.weight torch.Size([7168, 1024]) True 1153 model.dit.model.layers.15.norm1.linear.bias torch.Size([7168]) True 1154 model.dit.model.layers.15.norm1.norm.weight torch.Size([1792]) True 1155 model.dit.model.layers.15.ffn_norm1.weight torch.Size([1792]) True 1156 model.dit.model.layers.15.norm2.weight torch.Size([1792]) True 1157 model.dit.model.layers.15.ffn_norm2.weight torch.Size([1792]) True 1158 model.dit.model.layers.15.norm1_context.weight torch.Size([1792]) True 1159 model.dit.model.layers.16.gate torch.Size([28]) True 1160 model.dit.model.layers.16.attn1.norm_q.weight torch.Size([1792]) True 1161 model.dit.model.layers.16.attn1.norm_q.bias torch.Size([1792]) True 1162 model.dit.model.layers.16.attn1.norm_k.weight torch.Size([1792]) True 1163 model.dit.model.layers.16.attn1.norm_k.bias torch.Size([1792]) True 1164 model.dit.model.layers.16.attn1.to_q.weight torch.Size([1792, 1792]) True 1165 model.dit.model.layers.16.attn1.to_k.weight torch.Size([1792, 1792]) True 1166 model.dit.model.layers.16.attn1.to_v.weight torch.Size([1792, 1792]) True 1167 model.dit.model.layers.16.attn2.norm_q.weight torch.Size([1792]) True 1168 model.dit.model.layers.16.attn2.norm_q.bias torch.Size([1792]) True 1169 model.dit.model.layers.16.attn2.norm_k.weight torch.Size([1792]) True 1170 model.dit.model.layers.16.attn2.norm_k.bias torch.Size([1792]) True 1171 model.dit.model.layers.16.attn2.to_q.weight torch.Size([1792, 1792]) True 1172 model.dit.model.layers.16.attn2.to_k.weight torch.Size([1792, 1792]) True 1173 model.dit.model.layers.16.attn2.to_v.weight torch.Size([1792, 1792]) True 1174 model.dit.model.layers.16.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1175 model.dit.model.layers.16.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1176 model.dit.model.layers.16.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1177 model.dit.model.layers.16.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1178 model.dit.model.layers.16.norm1.linear.weight torch.Size([7168, 1024]) True 1179 
model.dit.model.layers.16.norm1.linear.bias torch.Size([7168]) True 1180 model.dit.model.layers.16.norm1.norm.weight torch.Size([1792]) True 1181 model.dit.model.layers.16.ffn_norm1.weight torch.Size([1792]) True 1182 model.dit.model.layers.16.norm2.weight torch.Size([1792]) True 1183 model.dit.model.layers.16.ffn_norm2.weight torch.Size([1792]) True 1184 model.dit.model.layers.16.norm1_context.weight torch.Size([1792]) True 1185 model.dit.model.layers.17.gate torch.Size([28]) True 1186 model.dit.model.layers.17.attn1.norm_q.weight torch.Size([1792]) True 1187 model.dit.model.layers.17.attn1.norm_q.bias torch.Size([1792]) True 1188 model.dit.model.layers.17.attn1.norm_k.weight torch.Size([1792]) True 1189 model.dit.model.layers.17.attn1.norm_k.bias torch.Size([1792]) True 1190 model.dit.model.layers.17.attn1.to_q.weight torch.Size([1792, 1792]) True 1191 model.dit.model.layers.17.attn1.to_k.weight torch.Size([1792, 1792]) True 1192 model.dit.model.layers.17.attn1.to_v.weight torch.Size([1792, 1792]) True 1193 model.dit.model.layers.17.attn2.norm_q.weight torch.Size([1792]) True 1194 model.dit.model.layers.17.attn2.norm_q.bias torch.Size([1792]) True 1195 model.dit.model.layers.17.attn2.norm_k.weight torch.Size([1792]) True 1196 model.dit.model.layers.17.attn2.norm_k.bias torch.Size([1792]) True 1197 model.dit.model.layers.17.attn2.to_q.weight torch.Size([1792, 1792]) True 1198 model.dit.model.layers.17.attn2.to_k.weight torch.Size([1792, 1792]) True 1199 model.dit.model.layers.17.attn2.to_v.weight torch.Size([1792, 1792]) True 1200 model.dit.model.layers.17.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1201 model.dit.model.layers.17.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1202 model.dit.model.layers.17.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1203 model.dit.model.layers.17.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1204 model.dit.model.layers.17.norm1.linear.weight torch.Size([7168, 1024]) True 1205 model.dit.model.layers.17.norm1.linear.bias torch.Size([7168]) True 1206 model.dit.model.layers.17.norm1.norm.weight torch.Size([1792]) True 1207 model.dit.model.layers.17.ffn_norm1.weight torch.Size([1792]) True 1208 model.dit.model.layers.17.norm2.weight torch.Size([1792]) True 1209 model.dit.model.layers.17.ffn_norm2.weight torch.Size([1792]) True 1210 model.dit.model.layers.17.norm1_context.weight torch.Size([1792]) True 1211 model.dit.model.layers.18.gate torch.Size([28]) True 1212 model.dit.model.layers.18.attn1.norm_q.weight torch.Size([1792]) True 1213 model.dit.model.layers.18.attn1.norm_q.bias torch.Size([1792]) True 1214 model.dit.model.layers.18.attn1.norm_k.weight torch.Size([1792]) True 1215 model.dit.model.layers.18.attn1.norm_k.bias torch.Size([1792]) True 1216 model.dit.model.layers.18.attn1.to_q.weight torch.Size([1792, 1792]) True 1217 model.dit.model.layers.18.attn1.to_k.weight torch.Size([1792, 1792]) True 1218 model.dit.model.layers.18.attn1.to_v.weight torch.Size([1792, 1792]) True 1219 model.dit.model.layers.18.attn2.norm_q.weight torch.Size([1792]) True 1220 model.dit.model.layers.18.attn2.norm_q.bias torch.Size([1792]) True 1221 model.dit.model.layers.18.attn2.norm_k.weight torch.Size([1792]) True 1222 model.dit.model.layers.18.attn2.norm_k.bias torch.Size([1792]) True 1223 model.dit.model.layers.18.attn2.to_q.weight torch.Size([1792, 1792]) True 1224 model.dit.model.layers.18.attn2.to_k.weight torch.Size([1792, 1792]) True 1225 model.dit.model.layers.18.attn2.to_v.weight torch.Size([1792, 1792]) True 1226 
model.dit.model.layers.18.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1227 model.dit.model.layers.18.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1228 model.dit.model.layers.18.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1229 model.dit.model.layers.18.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1230 model.dit.model.layers.18.norm1.linear.weight torch.Size([7168, 1024]) True 1231 model.dit.model.layers.18.norm1.linear.bias torch.Size([7168]) True 1232 model.dit.model.layers.18.norm1.norm.weight torch.Size([1792]) True 1233 model.dit.model.layers.18.ffn_norm1.weight torch.Size([1792]) True 1234 model.dit.model.layers.18.norm2.weight torch.Size([1792]) True 1235 model.dit.model.layers.18.ffn_norm2.weight torch.Size([1792]) True 1236 model.dit.model.layers.18.norm1_context.weight torch.Size([1792]) True 1237 model.dit.model.layers.19.gate torch.Size([28]) True 1238 model.dit.model.layers.19.attn1.norm_q.weight torch.Size([1792]) True 1239 model.dit.model.layers.19.attn1.norm_q.bias torch.Size([1792]) True 1240 model.dit.model.layers.19.attn1.norm_k.weight torch.Size([1792]) True 1241 model.dit.model.layers.19.attn1.norm_k.bias torch.Size([1792]) True 1242 model.dit.model.layers.19.attn1.to_q.weight torch.Size([1792, 1792]) True 1243 model.dit.model.layers.19.attn1.to_k.weight torch.Size([1792, 1792]) True 1244 model.dit.model.layers.19.attn1.to_v.weight torch.Size([1792, 1792]) True 1245 model.dit.model.layers.19.attn2.norm_q.weight torch.Size([1792]) True 1246 model.dit.model.layers.19.attn2.norm_q.bias torch.Size([1792]) True 1247 model.dit.model.layers.19.attn2.norm_k.weight torch.Size([1792]) True 1248 model.dit.model.layers.19.attn2.norm_k.bias torch.Size([1792]) True 1249 model.dit.model.layers.19.attn2.to_q.weight torch.Size([1792, 1792]) True 1250 model.dit.model.layers.19.attn2.to_k.weight torch.Size([1792, 1792]) True 1251 model.dit.model.layers.19.attn2.to_v.weight torch.Size([1792, 1792]) True 1252 model.dit.model.layers.19.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1253 model.dit.model.layers.19.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1254 model.dit.model.layers.19.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1255 model.dit.model.layers.19.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1256 model.dit.model.layers.19.norm1.linear.weight torch.Size([7168, 1024]) True 1257 model.dit.model.layers.19.norm1.linear.bias torch.Size([7168]) True 1258 model.dit.model.layers.19.norm1.norm.weight torch.Size([1792]) True 1259 model.dit.model.layers.19.ffn_norm1.weight torch.Size([1792]) True 1260 model.dit.model.layers.19.norm2.weight torch.Size([1792]) True 1261 model.dit.model.layers.19.ffn_norm2.weight torch.Size([1792]) True 1262 model.dit.model.layers.19.norm1_context.weight torch.Size([1792]) True 1263 model.dit.model.layers.20.gate torch.Size([28]) True 1264 model.dit.model.layers.20.attn1.norm_q.weight torch.Size([1792]) True 1265 model.dit.model.layers.20.attn1.norm_q.bias torch.Size([1792]) True 1266 model.dit.model.layers.20.attn1.norm_k.weight torch.Size([1792]) True 1267 model.dit.model.layers.20.attn1.norm_k.bias torch.Size([1792]) True 1268 model.dit.model.layers.20.attn1.to_q.weight torch.Size([1792, 1792]) True 1269 model.dit.model.layers.20.attn1.to_k.weight torch.Size([1792, 1792]) True 1270 model.dit.model.layers.20.attn1.to_v.weight torch.Size([1792, 1792]) True 1271 model.dit.model.layers.20.attn2.norm_q.weight torch.Size([1792]) True 1272 
model.dit.model.layers.20.attn2.norm_q.bias torch.Size([1792]) True 1273 model.dit.model.layers.20.attn2.norm_k.weight torch.Size([1792]) True 1274 model.dit.model.layers.20.attn2.norm_k.bias torch.Size([1792]) True 1275 model.dit.model.layers.20.attn2.to_q.weight torch.Size([1792, 1792]) True 1276 model.dit.model.layers.20.attn2.to_k.weight torch.Size([1792, 1792]) True 1277 model.dit.model.layers.20.attn2.to_v.weight torch.Size([1792, 1792]) True 1278 model.dit.model.layers.20.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1279 model.dit.model.layers.20.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1280 model.dit.model.layers.20.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1281 model.dit.model.layers.20.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1282 model.dit.model.layers.20.norm1.linear.weight torch.Size([7168, 1024]) True 1283 model.dit.model.layers.20.norm1.linear.bias torch.Size([7168]) True 1284 model.dit.model.layers.20.norm1.norm.weight torch.Size([1792]) True 1285 model.dit.model.layers.20.ffn_norm1.weight torch.Size([1792]) True 1286 model.dit.model.layers.20.norm2.weight torch.Size([1792]) True 1287 model.dit.model.layers.20.ffn_norm2.weight torch.Size([1792]) True 1288 model.dit.model.layers.20.norm1_context.weight torch.Size([1792]) True 1289 model.dit.model.layers.21.gate torch.Size([28]) True 1290 model.dit.model.layers.21.attn1.norm_q.weight torch.Size([1792]) True 1291 model.dit.model.layers.21.attn1.norm_q.bias torch.Size([1792]) True 1292 model.dit.model.layers.21.attn1.norm_k.weight torch.Size([1792]) True 1293 model.dit.model.layers.21.attn1.norm_k.bias torch.Size([1792]) True 1294 model.dit.model.layers.21.attn1.to_q.weight torch.Size([1792, 1792]) True 1295 model.dit.model.layers.21.attn1.to_k.weight torch.Size([1792, 1792]) True 1296 model.dit.model.layers.21.attn1.to_v.weight torch.Size([1792, 1792]) True 1297 model.dit.model.layers.21.attn2.norm_q.weight torch.Size([1792]) True 1298 model.dit.model.layers.21.attn2.norm_q.bias torch.Size([1792]) True 1299 model.dit.model.layers.21.attn2.norm_k.weight torch.Size([1792]) True 1300 model.dit.model.layers.21.attn2.norm_k.bias torch.Size([1792]) True 1301 model.dit.model.layers.21.attn2.to_q.weight torch.Size([1792, 1792]) True 1302 model.dit.model.layers.21.attn2.to_k.weight torch.Size([1792, 1792]) True 1303 model.dit.model.layers.21.attn2.to_v.weight torch.Size([1792, 1792]) True 1304 model.dit.model.layers.21.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1305 model.dit.model.layers.21.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1306 model.dit.model.layers.21.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1307 model.dit.model.layers.21.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1308 model.dit.model.layers.21.norm1.linear.weight torch.Size([7168, 1024]) True 1309 model.dit.model.layers.21.norm1.linear.bias torch.Size([7168]) True 1310 model.dit.model.layers.21.norm1.norm.weight torch.Size([1792]) True 1311 model.dit.model.layers.21.ffn_norm1.weight torch.Size([1792]) True 1312 model.dit.model.layers.21.norm2.weight torch.Size([1792]) True 1313 model.dit.model.layers.21.ffn_norm2.weight torch.Size([1792]) True 1314 model.dit.model.layers.21.norm1_context.weight torch.Size([1792]) True 1315 model.dit.model.layers.22.gate torch.Size([28]) True 1316 model.dit.model.layers.22.attn1.norm_q.weight torch.Size([1792]) True 1317 model.dit.model.layers.22.attn1.norm_q.bias torch.Size([1792]) True 1318 
model.dit.model.layers.22.attn1.norm_k.weight torch.Size([1792]) True 1319 model.dit.model.layers.22.attn1.norm_k.bias torch.Size([1792]) True 1320 model.dit.model.layers.22.attn1.to_q.weight torch.Size([1792, 1792]) True 1321 model.dit.model.layers.22.attn1.to_k.weight torch.Size([1792, 1792]) True 1322 model.dit.model.layers.22.attn1.to_v.weight torch.Size([1792, 1792]) True 1323 model.dit.model.layers.22.attn2.norm_q.weight torch.Size([1792]) True 1324 model.dit.model.layers.22.attn2.norm_q.bias torch.Size([1792]) True 1325 model.dit.model.layers.22.attn2.norm_k.weight torch.Size([1792]) True 1326 model.dit.model.layers.22.attn2.norm_k.bias torch.Size([1792]) True 1327 model.dit.model.layers.22.attn2.to_q.weight torch.Size([1792, 1792]) True 1328 model.dit.model.layers.22.attn2.to_k.weight torch.Size([1792, 1792]) True 1329 model.dit.model.layers.22.attn2.to_v.weight torch.Size([1792, 1792]) True 1330 model.dit.model.layers.22.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1331 model.dit.model.layers.22.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1332 model.dit.model.layers.22.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1333 model.dit.model.layers.22.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1334 model.dit.model.layers.22.norm1.linear.weight torch.Size([7168, 1024]) True 1335 model.dit.model.layers.22.norm1.linear.bias torch.Size([7168]) True 1336 model.dit.model.layers.22.norm1.norm.weight torch.Size([1792]) True 1337 model.dit.model.layers.22.ffn_norm1.weight torch.Size([1792]) True 1338 model.dit.model.layers.22.norm2.weight torch.Size([1792]) True 1339 model.dit.model.layers.22.ffn_norm2.weight torch.Size([1792]) True 1340 model.dit.model.layers.22.norm1_context.weight torch.Size([1792]) True 1341 model.dit.model.layers.23.gate torch.Size([28]) True 1342 model.dit.model.layers.23.attn1.norm_q.weight torch.Size([1792]) True 1343 model.dit.model.layers.23.attn1.norm_q.bias torch.Size([1792]) True 1344 model.dit.model.layers.23.attn1.norm_k.weight torch.Size([1792]) True 1345 model.dit.model.layers.23.attn1.norm_k.bias torch.Size([1792]) True 1346 model.dit.model.layers.23.attn1.to_q.weight torch.Size([1792, 1792]) True 1347 model.dit.model.layers.23.attn1.to_k.weight torch.Size([1792, 1792]) True 1348 model.dit.model.layers.23.attn1.to_v.weight torch.Size([1792, 1792]) True 1349 model.dit.model.layers.23.attn2.norm_q.weight torch.Size([1792]) True 1350 model.dit.model.layers.23.attn2.norm_q.bias torch.Size([1792]) True 1351 model.dit.model.layers.23.attn2.norm_k.weight torch.Size([1792]) True 1352 model.dit.model.layers.23.attn2.norm_k.bias torch.Size([1792]) True 1353 model.dit.model.layers.23.attn2.to_q.weight torch.Size([1792, 1792]) True 1354 model.dit.model.layers.23.attn2.to_k.weight torch.Size([1792, 1792]) True 1355 model.dit.model.layers.23.attn2.to_v.weight torch.Size([1792, 1792]) True 1356 model.dit.model.layers.23.attn2.to_out.0.weight torch.Size([1792, 1792]) True 1357 model.dit.model.layers.23.feed_forward.linear_1.weight torch.Size([4864, 1792]) True 1358 model.dit.model.layers.23.feed_forward.linear_2.weight torch.Size([1792, 4864]) True 1359 model.dit.model.layers.23.feed_forward.linear_3.weight torch.Size([4864, 1792]) True 1360 model.dit.model.layers.23.norm1.linear.weight torch.Size([7168, 1024]) True 1361 model.dit.model.layers.23.norm1.linear.bias torch.Size([7168]) True 1362 model.dit.model.layers.23.norm1.norm.weight torch.Size([1792]) True 1363 model.dit.model.layers.23.ffn_norm1.weight torch.Size([1792]) 
True 1364 model.dit.model.layers.23.norm2.weight torch.Size([1792]) True 1365 model.dit.model.layers.23.ffn_norm2.weight torch.Size([1792]) True 1366 model.dit.model.layers.23.norm1_context.weight torch.Size([1792]) True 1367 model.dit.model.norm_out.linear_1.weight torch.Size([1792, 1024]) True 1368 model.dit.model.norm_out.linear_1.bias torch.Size([1792]) True 1369 model.dit.model.norm_out.linear_2.weight torch.Size([1792, 1792]) True 1370 model.dit.model.norm_out.linear_2.bias torch.Size([1792]) True 1371 model.vae.encoder.conv_in.weight torch.Size([128, 3, 3, 3]) False 1372 model.vae.encoder.conv_in.bias torch.Size([128]) False 1373 model.vae.encoder.down_blocks.0.resnets.0.norm1.weight torch.Size([128]) False 1374 model.vae.encoder.down_blocks.0.resnets.0.norm1.bias torch.Size([128]) False 1375 model.vae.encoder.down_blocks.0.resnets.0.conv1.weight torch.Size([128, 128, 3, 3]) False 1376 model.vae.encoder.down_blocks.0.resnets.0.conv1.bias torch.Size([128]) False 1377 model.vae.encoder.down_blocks.0.resnets.0.norm2.weight torch.Size([128]) False 1378 model.vae.encoder.down_blocks.0.resnets.0.norm2.bias torch.Size([128]) False 1379 model.vae.encoder.down_blocks.0.resnets.0.conv2.weight torch.Size([128, 128, 3, 3]) False 1380 model.vae.encoder.down_blocks.0.resnets.0.conv2.bias torch.Size([128]) False 1381 model.vae.encoder.down_blocks.0.resnets.1.norm1.weight torch.Size([128]) False 1382 model.vae.encoder.down_blocks.0.resnets.1.norm1.bias torch.Size([128]) False 1383 model.vae.encoder.down_blocks.0.resnets.1.conv1.weight torch.Size([128, 128, 3, 3]) False 1384 model.vae.encoder.down_blocks.0.resnets.1.conv1.bias torch.Size([128]) False 1385 model.vae.encoder.down_blocks.0.resnets.1.norm2.weight torch.Size([128]) False 1386 model.vae.encoder.down_blocks.0.resnets.1.norm2.bias torch.Size([128]) False 1387 model.vae.encoder.down_blocks.0.resnets.1.conv2.weight torch.Size([128, 128, 3, 3]) False 1388 model.vae.encoder.down_blocks.0.resnets.1.conv2.bias torch.Size([128]) False 1389 model.vae.encoder.down_blocks.0.downsamplers.0.conv.weight torch.Size([128, 128, 3, 3]) False 1390 model.vae.encoder.down_blocks.0.downsamplers.0.conv.bias torch.Size([128]) False 1391 model.vae.encoder.down_blocks.1.resnets.0.norm1.weight torch.Size([128]) False 1392 model.vae.encoder.down_blocks.1.resnets.0.norm1.bias torch.Size([128]) False 1393 model.vae.encoder.down_blocks.1.resnets.0.conv1.weight torch.Size([256, 128, 3, 3]) False 1394 model.vae.encoder.down_blocks.1.resnets.0.conv1.bias torch.Size([256]) False 1395 model.vae.encoder.down_blocks.1.resnets.0.norm2.weight torch.Size([256]) False 1396 model.vae.encoder.down_blocks.1.resnets.0.norm2.bias torch.Size([256]) False 1397 model.vae.encoder.down_blocks.1.resnets.0.conv2.weight torch.Size([256, 256, 3, 3]) False 1398 model.vae.encoder.down_blocks.1.resnets.0.conv2.bias torch.Size([256]) False 1399 model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.weight torch.Size([256, 128, 1, 1]) False 1400 model.vae.encoder.down_blocks.1.resnets.0.conv_shortcut.bias torch.Size([256]) False 1401 model.vae.encoder.down_blocks.1.resnets.1.norm1.weight torch.Size([256]) False 1402 model.vae.encoder.down_blocks.1.resnets.1.norm1.bias torch.Size([256]) False 1403 model.vae.encoder.down_blocks.1.resnets.1.conv1.weight torch.Size([256, 256, 3, 3]) False 1404 model.vae.encoder.down_blocks.1.resnets.1.conv1.bias torch.Size([256]) False 1405 model.vae.encoder.down_blocks.1.resnets.1.norm2.weight torch.Size([256]) False 1406 
[parameter listing, condensed from the raw dump; this capture covers roughly indices 1407-2158, with each raw entry printed as: <idx> <parameter name> <torch.Size> <requires_grad>. Every parameter in this portion is frozen (requires_grad=False).]

model.vae  -- all frozen
  encoder.down_blocks.1   tail of resnets.1 at 256 ch (norm2, conv2 [256,256,3,3]); downsamplers.0.conv [256,256,3,3]
  encoder.down_blocks.2   resnets.0: 256 -> 512 (conv1 [512,256,3,3], conv_shortcut [512,256,1,1]); resnets.1 at 512 ch; downsamplers.0.conv [512,512,3,3]
  encoder.down_blocks.3   resnets.0 and resnets.1 at 512 ch (norm1/conv1/norm2/conv2, convs [512,512,3,3])
  encoder.mid_block       attentions.0: group_norm (512), to_q/to_k/to_v/to_out.0 all [512,512]; resnets.0 and resnets.1 at 512 ch
  encoder head            conv_norm_out (512), conv_out [32,512,3,3]
  decoder.conv_in         [512,16,3,3]
  decoder.up_blocks.0     resnets.0-2 at 512 ch; upsamplers.0.conv [512,512,3,3]
  decoder.up_blocks.1     resnets.0-2 at 512 ch; upsamplers.0.conv [512,512,3,3]
  decoder.up_blocks.2     resnets.0: 512 -> 256 (conv1 [256,512,3,3], conv_shortcut [256,512,1,1]); resnets.1-2 at 256 ch; upsamplers.0.conv [256,256,3,3]
  decoder.up_blocks.3     resnets.0: 256 -> 128 (conv1 [128,256,3,3], conv_shortcut [128,256,1,1]); resnets.1-2 at 128 ch
  decoder.mid_block       attentions.0: group_norm (512), to_q/to_k/to_v/to_out.0 all [512,512]; resnets.0 and resnets.1 at 512 ch
  decoder head            conv_norm_out (128), conv_out [3,128,3,3]

model.gen_vision_tower.vision_tower.model  -- all frozen (ViT-style tower, 1792-dim)
  embeddings              cls_token [1,1,1792], pos_embed [1,1025,1792], patch_embed.proj [1792,3,14,14]
  blocks.0 ... blocks.44  identical layout per block: norm1/norm2 [1792]; attn.qkv [5376,1792] (+ bias [5376]); attn.proj [1792,1792]; mlp.fc1 [15360,1792]; mlp.fc2 [1792,15360]; all weights and biases with requires_grad=False
  (the per-parameter listing continues past blocks.44 beyond the end of this capture)
model.gen_vision_tower.vision_tower.model.blocks.44.mlp.fc2.bias torch.Size([1792]) False 2159 model.gen_vision_tower.vision_tower.model.blocks.45.norm1.weight torch.Size([1792]) False 2160 model.gen_vision_tower.vision_tower.model.blocks.45.norm1.bias torch.Size([1792]) False 2161 model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.weight torch.Size([5376, 1792]) False 2162 model.gen_vision_tower.vision_tower.model.blocks.45.attn.qkv.bias torch.Size([5376]) False 2163 model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.weight torch.Size([1792, 1792]) False 2164 model.gen_vision_tower.vision_tower.model.blocks.45.attn.proj.bias torch.Size([1792]) False 2165 model.gen_vision_tower.vision_tower.model.blocks.45.norm2.weight torch.Size([1792]) False 2166 model.gen_vision_tower.vision_tower.model.blocks.45.norm2.bias torch.Size([1792]) False 2167 model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.weight torch.Size([15360, 1792]) False 2168 model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc1.bias torch.Size([15360]) False 2169 model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.weight torch.Size([1792, 15360]) False 2170 model.gen_vision_tower.vision_tower.model.blocks.45.mlp.fc2.bias torch.Size([1792]) False 2171 model.gen_vision_tower.vision_tower.model.blocks.46.norm1.weight torch.Size([1792]) False 2172 model.gen_vision_tower.vision_tower.model.blocks.46.norm1.bias torch.Size([1792]) False 2173 model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.weight torch.Size([5376, 1792]) False 2174 model.gen_vision_tower.vision_tower.model.blocks.46.attn.qkv.bias torch.Size([5376]) False 2175 model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.weight torch.Size([1792, 1792]) False 2176 model.gen_vision_tower.vision_tower.model.blocks.46.attn.proj.bias torch.Size([1792]) False 2177 model.gen_vision_tower.vision_tower.model.blocks.46.norm2.weight torch.Size([1792]) False 2178 model.gen_vision_tower.vision_tower.model.blocks.46.norm2.bias torch.Size([1792]) False 2179 model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.weight torch.Size([15360, 1792]) False 2180 model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc1.bias torch.Size([15360]) False 2181 model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.weight torch.Size([1792, 15360]) False 2182 model.gen_vision_tower.vision_tower.model.blocks.46.mlp.fc2.bias torch.Size([1792]) False 2183 model.gen_vision_tower.vision_tower.model.blocks.47.norm1.weight torch.Size([1792]) False 2184 model.gen_vision_tower.vision_tower.model.blocks.47.norm1.bias torch.Size([1792]) False 2185 model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.weight torch.Size([5376, 1792]) False 2186 model.gen_vision_tower.vision_tower.model.blocks.47.attn.qkv.bias torch.Size([5376]) False 2187 model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.weight torch.Size([1792, 1792]) False 2188 model.gen_vision_tower.vision_tower.model.blocks.47.attn.proj.bias torch.Size([1792]) False 2189 model.gen_vision_tower.vision_tower.model.blocks.47.norm2.weight torch.Size([1792]) False 2190 model.gen_vision_tower.vision_tower.model.blocks.47.norm2.bias torch.Size([1792]) False 2191 model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.weight torch.Size([15360, 1792]) False 2192 model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc1.bias torch.Size([15360]) False 2193 model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.weight torch.Size([1792, 15360]) False 2194 
model.gen_vision_tower.vision_tower.model.blocks.47.mlp.fc2.bias torch.Size([1792]) False 2195 model.gen_vision_tower.vision_tower.model.blocks.48.norm1.weight torch.Size([1792]) False 2196 model.gen_vision_tower.vision_tower.model.blocks.48.norm1.bias torch.Size([1792]) False 2197 model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.weight torch.Size([5376, 1792]) False 2198 model.gen_vision_tower.vision_tower.model.blocks.48.attn.qkv.bias torch.Size([5376]) False 2199 model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.weight torch.Size([1792, 1792]) False 2200 model.gen_vision_tower.vision_tower.model.blocks.48.attn.proj.bias torch.Size([1792]) False 2201 model.gen_vision_tower.vision_tower.model.blocks.48.norm2.weight torch.Size([1792]) False 2202 model.gen_vision_tower.vision_tower.model.blocks.48.norm2.bias torch.Size([1792]) False 2203 model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.weight torch.Size([15360, 1792]) False 2204 model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc1.bias torch.Size([15360]) False 2205 model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.weight torch.Size([1792, 15360]) False 2206 model.gen_vision_tower.vision_tower.model.blocks.48.mlp.fc2.bias torch.Size([1792]) False 2207 model.gen_vision_tower.vision_tower.model.blocks.49.norm1.weight torch.Size([1792]) False 2208 model.gen_vision_tower.vision_tower.model.blocks.49.norm1.bias torch.Size([1792]) False 2209 model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.weight torch.Size([5376, 1792]) False 2210 model.gen_vision_tower.vision_tower.model.blocks.49.attn.qkv.bias torch.Size([5376]) False 2211 model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.weight torch.Size([1792, 1792]) False 2212 model.gen_vision_tower.vision_tower.model.blocks.49.attn.proj.bias torch.Size([1792]) False 2213 model.gen_vision_tower.vision_tower.model.blocks.49.norm2.weight torch.Size([1792]) False 2214 model.gen_vision_tower.vision_tower.model.blocks.49.norm2.bias torch.Size([1792]) False 2215 model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.weight torch.Size([15360, 1792]) False 2216 model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc1.bias torch.Size([15360]) False 2217 model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.weight torch.Size([1792, 15360]) False 2218 model.gen_vision_tower.vision_tower.model.blocks.49.mlp.fc2.bias torch.Size([1792]) False 2219 model.gen_vision_tower.vision_tower.model.blocks.50.norm1.weight torch.Size([1792]) False 2220 model.gen_vision_tower.vision_tower.model.blocks.50.norm1.bias torch.Size([1792]) False 2221 model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.weight torch.Size([5376, 1792]) False 2222 model.gen_vision_tower.vision_tower.model.blocks.50.attn.qkv.bias torch.Size([5376]) False 2223 model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.weight torch.Size([1792, 1792]) False 2224 model.gen_vision_tower.vision_tower.model.blocks.50.attn.proj.bias torch.Size([1792]) False 2225 model.gen_vision_tower.vision_tower.model.blocks.50.norm2.weight torch.Size([1792]) False 2226 model.gen_vision_tower.vision_tower.model.blocks.50.norm2.bias torch.Size([1792]) False 2227 model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.weight torch.Size([15360, 1792]) False 2228 model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc1.bias torch.Size([15360]) False 2229 model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.weight torch.Size([1792, 15360]) False 2230 
model.gen_vision_tower.vision_tower.model.blocks.50.mlp.fc2.bias torch.Size([1792]) False 2231 model.gen_vision_tower.vision_tower.model.blocks.51.norm1.weight torch.Size([1792]) False 2232 model.gen_vision_tower.vision_tower.model.blocks.51.norm1.bias torch.Size([1792]) False 2233 model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.weight torch.Size([5376, 1792]) False 2234 model.gen_vision_tower.vision_tower.model.blocks.51.attn.qkv.bias torch.Size([5376]) False 2235 model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.weight torch.Size([1792, 1792]) False 2236 model.gen_vision_tower.vision_tower.model.blocks.51.attn.proj.bias torch.Size([1792]) False 2237 model.gen_vision_tower.vision_tower.model.blocks.51.norm2.weight torch.Size([1792]) False 2238 model.gen_vision_tower.vision_tower.model.blocks.51.norm2.bias torch.Size([1792]) False 2239 model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.weight torch.Size([15360, 1792]) False 2240 model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc1.bias torch.Size([15360]) False 2241 model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.weight torch.Size([1792, 15360]) False 2242 model.gen_vision_tower.vision_tower.model.blocks.51.mlp.fc2.bias torch.Size([1792]) False 2243 model.gen_vision_tower.vision_tower.model.blocks.52.norm1.weight torch.Size([1792]) False 2244 model.gen_vision_tower.vision_tower.model.blocks.52.norm1.bias torch.Size([1792]) False 2245 model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.weight torch.Size([5376, 1792]) False 2246 model.gen_vision_tower.vision_tower.model.blocks.52.attn.qkv.bias torch.Size([5376]) False 2247 model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.weight torch.Size([1792, 1792]) False 2248 model.gen_vision_tower.vision_tower.model.blocks.52.attn.proj.bias torch.Size([1792]) False 2249 model.gen_vision_tower.vision_tower.model.blocks.52.norm2.weight torch.Size([1792]) False 2250 model.gen_vision_tower.vision_tower.model.blocks.52.norm2.bias torch.Size([1792]) False 2251 model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.weight torch.Size([15360, 1792]) False 2252 model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc1.bias torch.Size([15360]) False 2253 model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.weight torch.Size([1792, 15360]) False 2254 model.gen_vision_tower.vision_tower.model.blocks.52.mlp.fc2.bias torch.Size([1792]) False 2255 model.gen_vision_tower.vision_tower.model.blocks.53.norm1.weight torch.Size([1792]) False 2256 model.gen_vision_tower.vision_tower.model.blocks.53.norm1.bias torch.Size([1792]) False 2257 model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.weight torch.Size([5376, 1792]) False 2258 model.gen_vision_tower.vision_tower.model.blocks.53.attn.qkv.bias torch.Size([5376]) False 2259 model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.weight torch.Size([1792, 1792]) False 2260 model.gen_vision_tower.vision_tower.model.blocks.53.attn.proj.bias torch.Size([1792]) False 2261 model.gen_vision_tower.vision_tower.model.blocks.53.norm2.weight torch.Size([1792]) False 2262 model.gen_vision_tower.vision_tower.model.blocks.53.norm2.bias torch.Size([1792]) False 2263 model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.weight torch.Size([15360, 1792]) False 2264 model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc1.bias torch.Size([15360]) False 2265 model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.weight torch.Size([1792, 15360]) False 2266 
model.gen_vision_tower.vision_tower.model.blocks.53.mlp.fc2.bias torch.Size([1792]) False 2267 model.gen_vision_tower.vision_tower.model.blocks.54.norm1.weight torch.Size([1792]) False 2268 model.gen_vision_tower.vision_tower.model.blocks.54.norm1.bias torch.Size([1792]) False 2269 model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.weight torch.Size([5376, 1792]) False 2270 model.gen_vision_tower.vision_tower.model.blocks.54.attn.qkv.bias torch.Size([5376]) False 2271 model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.weight torch.Size([1792, 1792]) False 2272 model.gen_vision_tower.vision_tower.model.blocks.54.attn.proj.bias torch.Size([1792]) False 2273 model.gen_vision_tower.vision_tower.model.blocks.54.norm2.weight torch.Size([1792]) False 2274 model.gen_vision_tower.vision_tower.model.blocks.54.norm2.bias torch.Size([1792]) False 2275 model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.weight torch.Size([15360, 1792]) False 2276 model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc1.bias torch.Size([15360]) False 2277 model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.weight torch.Size([1792, 15360]) False 2278 model.gen_vision_tower.vision_tower.model.blocks.54.mlp.fc2.bias torch.Size([1792]) False 2279 model.gen_vision_tower.vision_tower.model.blocks.55.norm1.weight torch.Size([1792]) False 2280 model.gen_vision_tower.vision_tower.model.blocks.55.norm1.bias torch.Size([1792]) False 2281 model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.weight torch.Size([5376, 1792]) False 2282 model.gen_vision_tower.vision_tower.model.blocks.55.attn.qkv.bias torch.Size([5376]) False 2283 model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.weight torch.Size([1792, 1792]) False 2284 model.gen_vision_tower.vision_tower.model.blocks.55.attn.proj.bias torch.Size([1792]) False 2285 model.gen_vision_tower.vision_tower.model.blocks.55.norm2.weight torch.Size([1792]) False 2286 model.gen_vision_tower.vision_tower.model.blocks.55.norm2.bias torch.Size([1792]) False 2287 model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.weight torch.Size([15360, 1792]) False 2288 model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc1.bias torch.Size([15360]) False 2289 model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.weight torch.Size([1792, 15360]) False 2290 model.gen_vision_tower.vision_tower.model.blocks.55.mlp.fc2.bias torch.Size([1792]) False 2291 model.gen_vision_tower.vision_tower.model.blocks.56.norm1.weight torch.Size([1792]) False 2292 model.gen_vision_tower.vision_tower.model.blocks.56.norm1.bias torch.Size([1792]) False 2293 model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.weight torch.Size([5376, 1792]) False 2294 model.gen_vision_tower.vision_tower.model.blocks.56.attn.qkv.bias torch.Size([5376]) False 2295 model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.weight torch.Size([1792, 1792]) False 2296 model.gen_vision_tower.vision_tower.model.blocks.56.attn.proj.bias torch.Size([1792]) False 2297 model.gen_vision_tower.vision_tower.model.blocks.56.norm2.weight torch.Size([1792]) False 2298 model.gen_vision_tower.vision_tower.model.blocks.56.norm2.bias torch.Size([1792]) False 2299 model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.weight torch.Size([15360, 1792]) False 2300 model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc1.bias torch.Size([15360]) False 2301 model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.weight torch.Size([1792, 15360]) False 2302 
model.gen_vision_tower.vision_tower.model.blocks.56.mlp.fc2.bias torch.Size([1792]) False 2303 model.gen_vision_tower.vision_tower.model.blocks.57.norm1.weight torch.Size([1792]) False 2304 model.gen_vision_tower.vision_tower.model.blocks.57.norm1.bias torch.Size([1792]) False 2305 model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.weight torch.Size([5376, 1792]) False 2306 model.gen_vision_tower.vision_tower.model.blocks.57.attn.qkv.bias torch.Size([5376]) False 2307 model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.weight torch.Size([1792, 1792]) False 2308 model.gen_vision_tower.vision_tower.model.blocks.57.attn.proj.bias torch.Size([1792]) False 2309 model.gen_vision_tower.vision_tower.model.blocks.57.norm2.weight torch.Size([1792]) False 2310 model.gen_vision_tower.vision_tower.model.blocks.57.norm2.bias torch.Size([1792]) False 2311 model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.weight torch.Size([15360, 1792]) False 2312 model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc1.bias torch.Size([15360]) False 2313 model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.weight torch.Size([1792, 15360]) False 2314 model.gen_vision_tower.vision_tower.model.blocks.57.mlp.fc2.bias torch.Size([1792]) False 2315 model.gen_vision_tower.vision_tower.model.blocks.58.norm1.weight torch.Size([1792]) False 2316 model.gen_vision_tower.vision_tower.model.blocks.58.norm1.bias torch.Size([1792]) False 2317 model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.weight torch.Size([5376, 1792]) False 2318 model.gen_vision_tower.vision_tower.model.blocks.58.attn.qkv.bias torch.Size([5376]) False 2319 model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.weight torch.Size([1792, 1792]) False 2320 model.gen_vision_tower.vision_tower.model.blocks.58.attn.proj.bias torch.Size([1792]) False 2321 model.gen_vision_tower.vision_tower.model.blocks.58.norm2.weight torch.Size([1792]) False 2322 model.gen_vision_tower.vision_tower.model.blocks.58.norm2.bias torch.Size([1792]) False 2323 model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.weight torch.Size([15360, 1792]) False 2324 model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc1.bias torch.Size([15360]) False 2325 model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.weight torch.Size([1792, 15360]) False 2326 model.gen_vision_tower.vision_tower.model.blocks.58.mlp.fc2.bias torch.Size([1792]) False 2327 model.gen_vision_tower.vision_tower.model.blocks.59.norm1.weight torch.Size([1792]) False 2328 model.gen_vision_tower.vision_tower.model.blocks.59.norm1.bias torch.Size([1792]) False 2329 model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.weight torch.Size([5376, 1792]) False 2330 model.gen_vision_tower.vision_tower.model.blocks.59.attn.qkv.bias torch.Size([5376]) False 2331 model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.weight torch.Size([1792, 1792]) False 2332 model.gen_vision_tower.vision_tower.model.blocks.59.attn.proj.bias torch.Size([1792]) False 2333 model.gen_vision_tower.vision_tower.model.blocks.59.norm2.weight torch.Size([1792]) False 2334 model.gen_vision_tower.vision_tower.model.blocks.59.norm2.bias torch.Size([1792]) False 2335 model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.weight torch.Size([15360, 1792]) False 2336 model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc1.bias torch.Size([15360]) False 2337 model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.weight torch.Size([1792, 15360]) False 2338 
model.gen_vision_tower.vision_tower.model.blocks.59.mlp.fc2.bias torch.Size([1792]) False 2339 model.gen_vision_tower.vision_tower.model.blocks.60.norm1.weight torch.Size([1792]) False 2340 model.gen_vision_tower.vision_tower.model.blocks.60.norm1.bias torch.Size([1792]) False 2341 model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.weight torch.Size([5376, 1792]) False 2342 model.gen_vision_tower.vision_tower.model.blocks.60.attn.qkv.bias torch.Size([5376]) False 2343 model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.weight torch.Size([1792, 1792]) False 2344 model.gen_vision_tower.vision_tower.model.blocks.60.attn.proj.bias torch.Size([1792]) False 2345 model.gen_vision_tower.vision_tower.model.blocks.60.norm2.weight torch.Size([1792]) False 2346 model.gen_vision_tower.vision_tower.model.blocks.60.norm2.bias torch.Size([1792]) False 2347 model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.weight torch.Size([15360, 1792]) False 2348 model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc1.bias torch.Size([15360]) False 2349 model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.weight torch.Size([1792, 15360]) False 2350 model.gen_vision_tower.vision_tower.model.blocks.60.mlp.fc2.bias torch.Size([1792]) False 2351 model.gen_vision_tower.vision_tower.model.blocks.61.norm1.weight torch.Size([1792]) False 2352 model.gen_vision_tower.vision_tower.model.blocks.61.norm1.bias torch.Size([1792]) False 2353 model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.weight torch.Size([5376, 1792]) False 2354 model.gen_vision_tower.vision_tower.model.blocks.61.attn.qkv.bias torch.Size([5376]) False 2355 model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.weight torch.Size([1792, 1792]) False 2356 model.gen_vision_tower.vision_tower.model.blocks.61.attn.proj.bias torch.Size([1792]) False 2357 model.gen_vision_tower.vision_tower.model.blocks.61.norm2.weight torch.Size([1792]) False 2358 model.gen_vision_tower.vision_tower.model.blocks.61.norm2.bias torch.Size([1792]) False 2359 model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.weight torch.Size([15360, 1792]) False 2360 model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc1.bias torch.Size([15360]) False 2361 model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.weight torch.Size([1792, 15360]) False 2362 model.gen_vision_tower.vision_tower.model.blocks.61.mlp.fc2.bias torch.Size([1792]) False 2363 model.gen_vision_tower.vision_tower.model.blocks.62.norm1.weight torch.Size([1792]) False 2364 model.gen_vision_tower.vision_tower.model.blocks.62.norm1.bias torch.Size([1792]) False 2365 model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.weight torch.Size([5376, 1792]) False 2366 model.gen_vision_tower.vision_tower.model.blocks.62.attn.qkv.bias torch.Size([5376]) False 2367 model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.weight torch.Size([1792, 1792]) False 2368 model.gen_vision_tower.vision_tower.model.blocks.62.attn.proj.bias torch.Size([1792]) False 2369 model.gen_vision_tower.vision_tower.model.blocks.62.norm2.weight torch.Size([1792]) False 2370 model.gen_vision_tower.vision_tower.model.blocks.62.norm2.bias torch.Size([1792]) False 2371 model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.weight torch.Size([15360, 1792]) False 2372 model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc1.bias torch.Size([15360]) False 2373 model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.weight torch.Size([1792, 15360]) False 2374 
model.gen_vision_tower.vision_tower.model.blocks.62.mlp.fc2.bias torch.Size([1792]) False 2375 model.gen_vision_tower.vision_tower.model.blocks.63.norm1.weight torch.Size([1792]) False 2376 model.gen_vision_tower.vision_tower.model.blocks.63.norm1.bias torch.Size([1792]) False 2377 model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.weight torch.Size([5376, 1792]) False 2378 model.gen_vision_tower.vision_tower.model.blocks.63.attn.qkv.bias torch.Size([5376]) False 2379 model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.weight torch.Size([1792, 1792]) False 2380 model.gen_vision_tower.vision_tower.model.blocks.63.attn.proj.bias torch.Size([1792]) False 2381 model.gen_vision_tower.vision_tower.model.blocks.63.norm2.weight torch.Size([1792]) False 2382 model.gen_vision_tower.vision_tower.model.blocks.63.norm2.bias torch.Size([1792]) False 2383 model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.weight torch.Size([15360, 1792]) False 2384 model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc1.bias torch.Size([15360]) False 2385 model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.weight torch.Size([1792, 15360]) False 2386 model.gen_vision_tower.vision_tower.model.blocks.63.mlp.fc2.bias torch.Size([1792]) False 2387 lm_head.weight torch.Size([151668, 3584]) False
Total number of training instances: 194420624
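A dump like the one above is typically produced by iterating over `model.named_parameters()` and printing each parameter's name, shape, and `requires_grad` flag; the uniform `False` values indicate that the generation vision tower (and `lm_head.weight`) are excluded from gradient updates in this stage. Below is a minimal sketch of such a check, assuming a generic PyTorch `torch.nn.Module`; the helper names (`dump_trainable_state`, `freeze_module`) are illustrative and not taken from the actual training script.

```python
import torch

def dump_trainable_state(model: torch.nn.Module) -> None:
    """Print index, name, shape, and requires_grad for every parameter,
    mirroring the log format above (illustrative helper, not the original script)."""
    for idx, (name, param) in enumerate(model.named_parameters()):
        print(idx, name, param.shape, param.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"trainable params: {trainable} / total params: {total}")

def freeze_module(module: torch.nn.Module) -> None:
    """Freeze a submodule so it is skipped by the optimizer, which is what the
    repeated `False` flags in the dump reflect."""
    for param in module.parameters():
        param.requires_grad_(False)
```

Calling something like `freeze_module(model.gen_vision_tower)` before constructing the optimizer would produce exactly the pattern of `False` flags seen here; whether the training code freezes the tower this way or through a config option is not visible from the log itself.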